/*
 * Decompiled with CFR 0.152.
 */
package weka.filters.unsupervised.attribute;

import java.io.File;
import java.util.Enumeration;
import java.util.Vector;
import weka.core.Capabilities;
import weka.core.DictionaryBuilder;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;
import weka.core.stemmers.NullStemmer;
import weka.core.stemmers.Stemmer;
import weka.core.stopwords.Null;
import weka.core.stopwords.StopwordsHandler;
import weka.core.tokenizers.Tokenizer;
import weka.core.tokenizers.WordTokenizer;
import weka.filters.Filter;
import weka.filters.UnsupervisedFilter;

/**
 * Converts string attributes into a set of numeric attributes representing
 * word occurrence information from the text contained in the strings.
 *
 * <p>The dictionary is built from the first batch of data passed through the
 * filter; later batches are vectorized against that dictionary. Almost all of
 * the configuration is stored in, and all of the heavy lifting is delegated
 * to, the wrapped {@link DictionaryBuilder}.
 */
public class StringToWordVector
extends Filter
implements UnsupervisedFilter,
OptionHandler,
WeightedInstancesHandler {
    /** For serialization. */
    static final long serialVersionUID = 8249106275278565424L;

    /** Normalization type: no normalization. */
    public static final int FILTER_NONE = 0;

    /** Normalization type: normalize all data. */
    public static final int FILTER_NORMALIZE_ALL = 1;

    /** Normalization type: normalize test data only. */
    public static final int FILTER_NORMALIZE_TEST_ONLY = 2;

    /** Tags describing the available normalization types (used with {@link SelectedTag}). */
    public static final Tag[] TAGS_FILTER = new Tag[]{new Tag(FILTER_NONE, "No normalization"), new Tag(FILTER_NORMALIZE_ALL, "Normalize all data"), new Tag(FILTER_NORMALIZE_TEST_ONLY, "Normalize test data only")};

    /** Placeholder path that means "no dictionary file has been chosen". */
    private static final String UNSET_DICTIONARY_PATH = "-- set me --";

    /** Builds the dictionary and vectorizes instances; also holds most option values. */
    protected DictionaryBuilder m_dictionaryBuilder = new DictionaryBuilder();

    /** Periodic pruning rate as a percentage of the input dataset; -1 is the default (see -prune-rate). */
    private double m_PeriodicPruningRate = -1.0;

    /** The selected normalization type, one of the {@code FILTER_*} constants. */
    protected int m_filterType = FILTER_NONE;

    /** File to save the dictionary to; the placeholder or an empty path means "do not save". */
    protected File m_dictionaryFile = new File(UNSET_DICTIONARY_PATH);

    /** Whether the dictionary is saved as a binary serialized object rather than plain text. */
    protected boolean m_dictionaryIsBinary;

    /**
     * Creates a filter with default settings.
     */
    public StringToWordVector() {
    }

    /**
     * Creates a filter that attempts to keep the given number of words.
     *
     * @param wordsToKeep the number of words to keep, forwarded to the dictionary builder
     */
    public StringToWordVector(int wordsToKeep) {
        this.m_dictionaryBuilder.setWordsToKeep(wordsToKeep);
    }

    /**
     * Returns an enumeration describing the available command-line options.
     *
     * @return an enumeration of all the available options
     */
    @Override
    public Enumeration<Option> listOptions() {
        Vector<Option> result = new Vector<Option>();
        result.addElement(new Option("\tOutput word counts rather than boolean word presence.\n", "C", 0, "-C"));
        result.addElement(new Option("\tSpecify list of string attributes to convert to words (as weka Range).\n\t(default: select all string attributes)", "R", 1, "-R <index1,index2-index4,...>"));
        result.addElement(new Option("\tInvert matching sense of column indexes.", "V", 0, "-V"));
        result.addElement(new Option("\tSpecify a prefix for the created attribute names.\n\t(default: \"\")", "P", 1, "-P <attribute name prefix>"));
        result.addElement(new Option("\tSpecify approximate number of word fields to create.\n\tSurplus words will be discarded..\n\t(default: 1000)", "W", 1, "-W <number of words to keep>"));
        result.addElement(new Option("\tSpecify the rate (e.g., every 10% of the input dataset) at which to periodically prune the dictionary.\n\t-W prunes after creating a full dictionary. You may not have enough memory for this approach.\n\t(default: no periodic pruning)", "prune-rate", 1, "-prune-rate <rate as a percentage of dataset>"));
        result.addElement(new Option("\tTransform the word frequencies into log(1+fij)\n\twhere fij is the frequency of word i in jth document(instance).\n", "T", 0, "-T"));
        result.addElement(new Option("\tTransform each word frequency into:\n\tfij*log(num of Documents/num of documents containing word i)\n\t  where fij if frequency of word i in jth document(instance)", "I", 0, "-I"));
        result.addElement(new Option("\tWhether to 0=not normalize/1=normalize all data/2=normalize test data only\n\tto average length of training documents (default 0=don't normalize).", "N", 1, "-N"));
        result.addElement(new Option("\tConvert all tokens to lowercase before adding to the dictionary.", "L", 0, "-L"));
        // The option name carries no leading dash, matching every other option
        // here and the name that setOptions() actually parses.
        result.addElement(new Option("\tThe stopwords handler to use (default Null).", "stopwords-handler", 1, "-stopwords-handler"));
        result.addElement(new Option("\tThe stemming algorithm (classname plus parameters) to use.", "stemmer", 1, "-stemmer <spec>"));
        result.addElement(new Option("\tThe minimum term frequency (default = 1).", "M", 1, "-M <int>"));
        result.addElement(new Option("\tIf this is set, the maximum number of words and the \n\tminimum term frequency is not enforced on a per-class \n\tbasis but based on the documents in all the classes \n\t(even if a class attribute is set).", "O", 0, "-O"));
        result.addElement(new Option("\tThe tokenizing algorithm (classname plus parameters) to use.\n\t(default: " + WordTokenizer.class.getName() + ")", "tokenizer", 1, "-tokenizer <spec>"));
        result.addElement(new Option("\tThe file to save the dictionary to.\n\t(default is not to save the dictionary)", "dictionary", 1, "-dictionary <path to save to>"));
        result.addElement(new Option("\tSave the dictionary file as a binary serialized object\n\tinstead of in plain text form. Use in conjunction with\n\t-dictionary", "binary-dict", 0, "-binary-dict"));
        return result.elements();
    }

    /**
     * Parses the given list of options and configures this filter accordingly.
     * Options that are absent are reset to their defaults (as listed by
     * {@link #listOptions()}).
     *
     * @param options the list of options as an array of strings
     * @throws Exception if an option value is invalid, a class specification
     *         cannot be instantiated, or unrecognized options remain
     */
    @Override
    public void setOptions(String[] options) throws Exception {
        String value = Utils.getOption('R', options);
        this.setSelectedRange(value.length() != 0 ? value : "first-last");

        this.setInvertSelection(Utils.getFlag('V', options));

        // An absent -P yields the empty string, which is also the default prefix.
        this.setAttributeNamePrefix(Utils.getOption('P', options));

        value = Utils.getOption('W', options);
        this.setWordsToKeep(value.length() != 0 ? Integer.parseInt(value) : 1000);

        value = Utils.getOption("prune-rate", options);
        this.setPeriodicPruning(value.length() > 0 ? Double.parseDouble(value) : -1.0);

        value = Utils.getOption('M', options);
        this.setMinTermFreq(value.length() != 0 ? Integer.parseInt(value) : 1);

        this.setOutputWordCounts(Utils.getFlag('C', options));
        this.setTFTransform(Utils.getFlag('T', options));
        this.setIDFTransform(Utils.getFlag('I', options));
        this.setDoNotOperateOnPerClassBasis(Utils.getFlag('O', options));

        value = Utils.getOption('N', options);
        int normalizeType = value.length() != 0 ? Integer.parseInt(value) : FILTER_NONE;
        this.setNormalizeDocLength(new SelectedTag(normalizeType, TAGS_FILTER));

        this.setLowerCaseTokens(Utils.getFlag('L', options));

        // setStemmer(null) / setStopwordsHandler(null) install the "null" implementations.
        value = Utils.getOption("stemmer", options);
        if (value.length() == 0) {
            this.setStemmer(null);
        } else {
            this.setStemmer((Stemmer)StringToWordVector.instantiateFromSpec(Stemmer.class, value, "stemmer"));
        }

        value = Utils.getOption("stopwords-handler", options);
        if (value.length() == 0) {
            this.setStopwordsHandler(null);
        } else {
            this.setStopwordsHandler((StopwordsHandler)StringToWordVector.instantiateFromSpec(StopwordsHandler.class, value, "StopwordsHandler"));
        }

        value = Utils.getOption("tokenizer", options);
        if (value.length() == 0) {
            this.setTokenizer(new WordTokenizer());
        } else {
            this.setTokenizer((Tokenizer)StringToWordVector.instantiateFromSpec(Tokenizer.class, value, "tokenizer"));
        }

        // An absent -dictionary yields an empty path, which is treated as "do not save".
        this.setDictionaryFileToSaveTo(new File(Utils.getOption("dictionary", options)));
        this.setSaveDictionaryInBinaryForm(Utils.getFlag("binary-dict", options));

        Utils.checkForRemainingOptions(options);
    }

    /**
     * Instantiates an object from a "classname plus parameters" specification.
     *
     * @param type the type the instantiated class must be assignable to
     * @param spec the specification string: class name followed by its options
     * @param label name used in the error message for an empty specification
     * @return the configured instance
     * @throws Exception if the specification is empty or instantiation fails
     */
    private static Object instantiateFromSpec(Class<?> type, String spec, String label) throws Exception {
        String[] parts = Utils.splitOptions(spec);
        if (parts.length == 0) {
            throw new Exception("Invalid " + label + " specification string");
        }
        String className = parts[0];
        // Blank out the class name so only the component's own options remain.
        parts[0] = "";
        return Utils.forName(type, className, parts);
    }

    /**
     * Gets the current settings of the filter.
     *
     * @return an array of strings suitable for passing to {@link #setOptions(String[])}
     */
    @Override
    public String[] getOptions() {
        Vector<String> result = new Vector<String>();
        result.add("-R");
        result.add(this.getSelectedRange().getRanges());
        if (this.getInvertSelection()) {
            result.add("-V");
        }
        if (!"".equals(this.getAttributeNamePrefix())) {
            result.add("-P");
            result.add(this.getAttributeNamePrefix());
        }
        result.add("-W");
        result.add(String.valueOf(this.getWordsToKeep()));
        result.add("-prune-rate");
        result.add(String.valueOf(this.getPeriodicPruning()));
        if (this.getOutputWordCounts()) {
            result.add("-C");
        }
        if (this.getTFTransform()) {
            result.add("-T");
        }
        if (this.getIDFTransform()) {
            result.add("-I");
        }
        result.add("-N");
        result.add(String.valueOf(this.m_filterType));
        if (this.getLowerCaseTokens()) {
            result.add("-L");
        }
        Stemmer stemmer = this.getStemmer();
        if (stemmer != null) {
            result.add("-stemmer");
            result.add(StringToWordVector.toSpec(stemmer));
        }
        StopwordsHandler stopwordsHandler = this.getStopwordsHandler();
        if (stopwordsHandler != null) {
            result.add("-stopwords-handler");
            result.add(StringToWordVector.toSpec(stopwordsHandler));
        }
        result.add("-M");
        result.add(String.valueOf(this.getMinTermFreq()));
        if (this.getDoNotOperateOnPerClassBasis()) {
            result.add("-O");
        }
        result.add("-tokenizer");
        result.add(StringToWordVector.toSpec(this.getTokenizer()));
        if (this.hasDictionaryFile()) {
            result.add("-dictionary");
            result.add(this.m_dictionaryFile.toString());
            if (this.getSaveDictionaryInBinaryForm()) {
                result.add("-binary-dict");
            }
        }
        return result.toArray(new String[result.size()]);
    }

    /**
     * Builds the "classname plus parameters" specification string for a component.
     *
     * @param component the component to describe
     * @return the class name, followed by the joined options if the component
     *         is an {@link OptionHandler}
     */
    private static String toSpec(Object component) {
        String spec = component.getClass().getName();
        if (component instanceof OptionHandler) {
            spec = spec + " " + Utils.joinOptions(((OptionHandler)component).getOptions());
        }
        return spec.trim();
    }

    /**
     * Tells whether a real dictionary file has been configured, i.e. the file is
     * non-null, its path is non-empty and it is not the "-- set me --" placeholder.
     *
     * @return true if the dictionary should be saved
     */
    private boolean hasDictionaryFile() {
        if (this.m_dictionaryFile == null) {
            return false;
        }
        String path = this.m_dictionaryFile.toString();
        return path.length() > 0 && !path.equalsIgnoreCase(UNSET_DICTIONARY_PATH);
    }

    /**
     * Returns the capabilities of this filter: all attribute types, all class
     * types, missing values, missing class values and no class.
     *
     * @return the capabilities of this filter
     */
    @Override
    public Capabilities getCapabilities() {
        Capabilities result = super.getCapabilities();
        result.disableAll();
        result.enableAllAttributes();
        result.enable(Capabilities.Capability.MISSING_VALUES);
        result.enableAllClasses();
        result.enable(Capabilities.Capability.MISSING_CLASS_VALUES);
        result.enable(Capabilities.Capability.NO_CLASS);
        return result;
    }

    /**
     * Sets the format of the input instances and resets the dictionary builder.
     *
     * @param instanceInfo an Instances object describing the input structure
     * @return always false: the output format is only set in {@link #batchFinished()}
     * @throws Exception if the input format cannot be set or the builder setup fails
     */
    @Override
    public boolean setInputFormat(Instances instanceInfo) throws Exception {
        super.setInputFormat(instanceInfo);
        this.m_dictionaryBuilder.reset();
        this.m_dictionaryBuilder.setSortDictionary(true);
        this.m_dictionaryBuilder.setNormalize(false);
        this.m_dictionaryBuilder.setup(instanceInfo);
        return false;
    }

    /**
     * Inputs an instance for filtering. During the first batch instances are
     * only buffered; afterwards each instance is vectorized and pushed to the
     * output queue immediately.
     *
     * @param instance the input instance
     * @return true if the filtered instance may now be collected with output()
     * @throws IllegalStateException if no input format has been defined
     * @throws Exception if vectorizing the instance fails
     */
    @Override
    public boolean input(Instance instance) throws Exception {
        if (this.getInputFormat() == null) {
            throw new IllegalStateException("No input instance format defined");
        }
        if (this.m_NewBatch) {
            this.resetQueue();
            this.m_NewBatch = false;
        }
        if (this.isFirstBatchDone()) {
            Instance inst = this.m_dictionaryBuilder.vectorizeInstance(instance);
            this.push(inst, false);
            return true;
        }
        this.bufferInput(instance);
        return false;
    }

    /**
     * Signals that the current batch of input has finished. For the first batch
     * this builds the dictionary from the buffered instances, sets the output
     * format, vectorizes the buffered batch, optionally saves the dictionary and
     * pushes the converted instances to the output queue.
     *
     * @return true if there are instances pending output
     * @throws IllegalStateException if no input format has been defined
     * @throws Exception if building the dictionary, vectorizing or saving fails
     */
    @Override
    public boolean batchFinished() throws Exception {
        if (this.getInputFormat() == null) {
            throw new IllegalStateException("No input instance format defined");
        }
        // Later batches were already vectorized instance-by-instance in input().
        if (!this.isFirstBatchDone()) {
            Instances buffered = this.getInputFormat();

            // The pruning rate is a percentage of the dataset; convert it to an instance count.
            long pruneRate = Math.round(this.m_PeriodicPruningRate / 100.0 * (double)buffered.numInstances());
            this.m_dictionaryBuilder.setPeriodicPruning(pruneRate);
            for (int i = 0; i < buffered.numInstances(); ++i) {
                this.m_dictionaryBuilder.processInstance(buffered.instance(i));
            }
            this.m_dictionaryBuilder.finalizeDictionary();
            this.setOutputFormat(this.m_dictionaryBuilder.getVectorizedFormat());

            // NOTE(review): FILTER_NORMALIZE_ALL and FILTER_NORMALIZE_TEST_ONLY are
            // treated identically here for the first batch; whether "test only"
            // really leaves the first batch un-normalized depends on
            // DictionaryBuilder - confirm against its implementation.
            boolean normalize = this.m_filterType != FILTER_NONE;
            this.m_dictionaryBuilder.setNormalize(normalize);
            Instances converted = this.m_dictionaryBuilder.vectorizeBatch(this.getInputFormat(), normalize);

            if (this.hasDictionaryFile()) {
                // Second argument is the negation of the "binary" setting.
                this.m_dictionaryBuilder.saveDictionary(this.m_dictionaryFile, !this.m_dictionaryIsBinary);
            }
            for (int i = 0; i < converted.numInstances(); ++i) {
                this.push(converted.instance(i), false);
            }
        }
        this.flushInput();
        this.m_NewBatch = true;
        this.m_FirstBatchDone = true;
        return this.numPendingOutput() != 0;
    }

    /**
     * Returns the tip text for the dictionaryFileToSaveTo property.
     *
     * @return the tip text
     */
    public String dictionaryFileToSaveToTipText() {
        return "The path to save the dictionary file to - an empty path or a path '-- set me --' means do not save the dictionary.";
    }

    /**
     * Sets the file to save the dictionary to.
     *
     * @param toSaveTo the file to save to
     */
    public void setDictionaryFileToSaveTo(File toSaveTo) {
        this.m_dictionaryFile = toSaveTo;
    }

    /**
     * Gets the file to save the dictionary to.
     *
     * @return the file to save to
     */
    public File getDictionaryFileToSaveTo() {
        return this.m_dictionaryFile;
    }

    /**
     * Returns the tip text for the saveDictionaryInBinaryForm property.
     *
     * @return the tip text
     */
    public String saveDictionaryInBinaryFormTipText() {
        return "Save the dictionary as a binary serialized java object instead of in plain text form.";
    }

    /**
     * Sets whether the dictionary is saved in binary form.
     *
     * @param saveAsBinary true to save as a binary serialized object
     */
    public void setSaveDictionaryInBinaryForm(boolean saveAsBinary) {
        this.m_dictionaryIsBinary = saveAsBinary;
    }

    /**
     * Gets whether the dictionary is saved in binary form.
     *
     * @return true if the dictionary is saved as a binary serialized object
     */
    public boolean getSaveDictionaryInBinaryForm() {
        return this.m_dictionaryIsBinary;
    }

    /**
     * Returns a string describing this filter.
     *
     * @return a description of the filter
     */
    public String globalInfo() {
        return "Converts string attributes into a set of numeric attributes representing word occurrence information from the text contained in the strings. The dictionary is determined from the first batch of data filtered (typically training data). Note that this filter is not strictly unsupervised when a class attribute is set because it creates a separate dictionary for each class and then merges them.";
    }

    /**
     * Gets whether word counts are output instead of boolean word presence.
     *
     * @return the value held by the dictionary builder
     */
    public boolean getOutputWordCounts() {
        return this.m_dictionaryBuilder.getOutputWordCounts();
    }

    /**
     * Sets whether word counts are output instead of boolean word presence.
     *
     * @param outputWordCounts the new value, forwarded to the dictionary builder
     */
    public void setOutputWordCounts(boolean outputWordCounts) {
        this.m_dictionaryBuilder.setOutputWordCounts(outputWordCounts);
    }

    /**
     * Returns the tip text for the outputWordCounts property.
     *
     * @return the tip text
     */
    public String outputWordCountsTipText() {
        return "Output word counts rather than boolean 0 or 1(indicating presence or absence of a word).";
    }

    /**
     * Gets the range of attributes selected for conversion.
     *
     * @return the range held by the dictionary builder
     */
    public Range getSelectedRange() {
        return this.m_dictionaryBuilder.getSelectedRange();
    }

    /**
     * Sets the range of attributes selected for conversion.
     *
     * @param newSelectedRange the range string, forwarded to the dictionary builder
     */
    public void setSelectedRange(String newSelectedRange) {
        this.m_dictionaryBuilder.setSelectedRange(newSelectedRange);
    }

    /**
     * Returns the tip text for the attributeIndices property.
     *
     * @return the tip text
     */
    public String attributeIndicesTipText() {
        return "Specify range of attributes to act on. This is a comma separated list of attribute indices, with \"first\" and \"last\" valid values. Specify an inclusive range with \"-\". E.g: \"first-3,5,6-10,last\".";
    }

    /**
     * Gets the attribute indices as a range list string.
     *
     * @return the value held by the dictionary builder
     */
    public String getAttributeIndices() {
        return this.m_dictionaryBuilder.getAttributeIndices();
    }

    /**
     * Sets the attribute indices from a range list string.
     *
     * @param rangeList the range list, forwarded to the dictionary builder
     */
    public void setAttributeIndices(String rangeList) {
        this.m_dictionaryBuilder.setAttributeIndices(rangeList);
    }

    /**
     * Sets the attribute indices from an array.
     *
     * @param attributes the attribute indices, forwarded to the dictionary builder
     */
    public void setAttributeIndicesArray(int[] attributes) {
        this.m_dictionaryBuilder.setAttributeIndicesArray(attributes);
    }

    /**
     * Returns the tip text for the invertSelection property.
     *
     * @return the tip text
     */
    public String invertSelectionTipText() {
        return "Set attribute selection mode. If false, only selected attributes in the range will be worked on; if true, only non-selected attributes will be processed.";
    }

    /**
     * Gets whether the attribute selection is inverted.
     *
     * @return the value held by the dictionary builder
     */
    public boolean getInvertSelection() {
        return this.m_dictionaryBuilder.getInvertSelection();
    }

    /**
     * Sets whether the attribute selection is inverted.
     *
     * @param invert the new value, forwarded to the dictionary builder
     */
    public void setInvertSelection(boolean invert) {
        this.m_dictionaryBuilder.setInvertSelection(invert);
    }

    /**
     * Gets the prefix used for the created attribute names.
     *
     * @return the value held by the dictionary builder
     */
    public String getAttributeNamePrefix() {
        return this.m_dictionaryBuilder.getAttributeNamePrefix();
    }

    /**
     * Sets the prefix used for the created attribute names.
     *
     * @param newPrefix the new prefix, forwarded to the dictionary builder
     */
    public void setAttributeNamePrefix(String newPrefix) {
        this.m_dictionaryBuilder.setAttributeNamePrefix(newPrefix);
    }

    /**
     * Returns the tip text for the attributeNamePrefix property.
     *
     * @return the tip text
     */
    public String attributeNamePrefixTipText() {
        return "Prefix for the created attribute names. (default: \"\")";
    }

    /**
     * Gets the number of words to attempt to keep.
     *
     * @return the value held by the dictionary builder
     */
    public int getWordsToKeep() {
        return this.m_dictionaryBuilder.getWordsToKeep();
    }

    /**
     * Sets the number of words to attempt to keep.
     *
     * @param newWordsToKeep the new value, forwarded to the dictionary builder
     */
    public void setWordsToKeep(int newWordsToKeep) {
        this.m_dictionaryBuilder.setWordsToKeep(newWordsToKeep);
    }

    /**
     * Returns the tip text for the wordsToKeep property.
     *
     * @return the tip text
     */
    public String wordsToKeepTipText() {
        return "The number of words (per class if there is a class attribute assigned) to attempt to keep.";
    }

    /**
     * Gets the periodic pruning rate (as a percentage of the input dataset).
     *
     * @return the pruning rate
     */
    public double getPeriodicPruning() {
        return this.m_PeriodicPruningRate;
    }

    /**
     * Sets the periodic pruning rate (as a percentage of the input dataset).
     *
     * @param newPeriodicPruning the new pruning rate
     */
    public void setPeriodicPruning(double newPeriodicPruning) {
        this.m_PeriodicPruningRate = newPeriodicPruning;
    }

    /**
     * Returns the tip text for the periodicPruning property.
     *
     * @return the tip text
     */
    public String periodicPruningTipText() {
        return "Specify the rate (x% of the input dataset) at which to periodically prune the dictionary. wordsToKeep prunes after creating a full dictionary. You may not have enough memory for this approach.";
    }

    /**
     * Gets whether the TF transform is applied.
     *
     * @return the value held by the dictionary builder
     */
    public boolean getTFTransform() {
        return this.m_dictionaryBuilder.getTFTransform();
    }

    /**
     * Sets whether the TF transform is applied.
     *
     * @param TFTransform the new value, forwarded to the dictionary builder
     */
    public void setTFTransform(boolean TFTransform) {
        this.m_dictionaryBuilder.setTFTransform(TFTransform);
    }

    /**
     * Returns the tip text for the TFTransform property.
     *
     * @return the tip text
     */
    public String TFTransformTipText() {
        return "Sets whether if the word frequencies should be transformed into  log(1+fij) where fij is the frequency of word i in document (instance) j.";
    }

    /**
     * Gets whether the IDF transform is applied.
     *
     * @return the value held by the dictionary builder
     */
    public boolean getIDFTransform() {
        return this.m_dictionaryBuilder.getIDFTransform();
    }

    /**
     * Sets whether the IDF transform is applied.
     *
     * @param IDFTransform the new value, forwarded to the dictionary builder
     */
    public void setIDFTransform(boolean IDFTransform) {
        this.m_dictionaryBuilder.setIDFTransform(IDFTransform);
    }

    /**
     * Returns the tip text for the IDFTransform property.
     *
     * @return the tip text
     */
    public String IDFTransformTipText() {
        return "Sets whether if the word frequencies in a document should be transformed into: \n   fij*log(num of Docs/num of Docs with word i) \n      where fij is the frequency of word i in document (instance) j.";
    }

    /**
     * Gets the selected document-length normalization type.
     *
     * @return a new {@link SelectedTag} over {@link #TAGS_FILTER} for the current type
     */
    public SelectedTag getNormalizeDocLength() {
        return new SelectedTag(this.m_filterType, TAGS_FILTER);
    }

    /**
     * Sets the document-length normalization type. Tags that do not belong to
     * {@link #TAGS_FILTER} are ignored.
     *
     * @param newType the new normalization type
     */
    public void setNormalizeDocLength(SelectedTag newType) {
        if (newType.getTags() == TAGS_FILTER) {
            this.m_filterType = newType.getSelectedTag().getID();
        }
    }

    /**
     * Returns the tip text for the normalizeDocLength property.
     *
     * @return the tip text
     */
    public String normalizeDocLengthTipText() {
        return "Sets whether if the word frequencies for a document (instance) should be normalized or not.";
    }

    /**
     * Gets whether tokens are converted to lower case.
     *
     * @return the value held by the dictionary builder
     */
    public boolean getLowerCaseTokens() {
        return this.m_dictionaryBuilder.getLowerCaseTokens();
    }

    /**
     * Sets whether tokens are converted to lower case.
     *
     * @param downCaseTokens the new value, forwarded to the dictionary builder
     */
    public void setLowerCaseTokens(boolean downCaseTokens) {
        this.m_dictionaryBuilder.setLowerCaseTokens(downCaseTokens);
    }

    /**
     * Returns the tip text for the doNotOperateOnPerClassBasis property.
     *
     * @return the tip text
     */
    public String doNotOperateOnPerClassBasisTipText() {
        return "If this is set, the maximum number of words and the minimum term frequency is not enforced on a per-class basis but based on the documents in all the classes (even if a class attribute is set).";
    }

    /**
     * Gets the doNotOperateOnPerClassBasis flag.
     *
     * @return the value held by the dictionary builder
     */
    public boolean getDoNotOperateOnPerClassBasis() {
        return this.m_dictionaryBuilder.getDoNotOperateOnPerClassBasis();
    }

    /**
     * Sets the doNotOperateOnPerClassBasis flag.
     *
     * @param newDoNotOperateOnPerClassBasis the new value, forwarded to the dictionary builder
     */
    public void setDoNotOperateOnPerClassBasis(boolean newDoNotOperateOnPerClassBasis) {
        this.m_dictionaryBuilder.setDoNotOperateOnPerClassBasis(newDoNotOperateOnPerClassBasis);
    }

    /**
     * Returns the tip text for the minTermFreq property.
     *
     * @return the tip text
     */
    public String minTermFreqTipText() {
        return "Sets the minimum term frequency. This is enforced on a per-class basis.";
    }

    /**
     * Gets the minimum term frequency.
     *
     * @return the value held by the dictionary builder
     */
    public int getMinTermFreq() {
        return this.m_dictionaryBuilder.getMinTermFreq();
    }

    /**
     * Sets the minimum term frequency.
     *
     * @param newMinTermFreq the new value, forwarded to the dictionary builder
     */
    public void setMinTermFreq(int newMinTermFreq) {
        this.m_dictionaryBuilder.setMinTermFreq(newMinTermFreq);
    }

    /**
     * Returns the tip text for the lowerCaseTokens property.
     *
     * @return the tip text
     */
    public String lowerCaseTokensTipText() {
        return "If set then all the word tokens are converted to lower case before being added to the dictionary.";
    }

    /**
     * Sets the stemming algorithm to use.
     *
     * @param value the stemmer; null installs a {@link NullStemmer}
     */
    public void setStemmer(Stemmer value) {
        this.m_dictionaryBuilder.setStemmer(value != null ? value : new NullStemmer());
    }

    /**
     * Gets the stemming algorithm in use.
     *
     * @return the stemmer held by the dictionary builder
     */
    public Stemmer getStemmer() {
        return this.m_dictionaryBuilder.getStemmer();
    }

    /**
     * Returns the tip text for the stemmer property.
     *
     * @return the tip text
     */
    public String stemmerTipText() {
        return "The stemming algorithm to use on the words.";
    }

    /**
     * Sets the stopwords handler to use.
     *
     * @param value the stopwords handler; null installs a {@link Null} handler
     */
    public void setStopwordsHandler(StopwordsHandler value) {
        this.m_dictionaryBuilder.setStopwordsHandler(value != null ? value : new Null());
    }

    /**
     * Gets the stopwords handler in use.
     *
     * @return the stopwords handler held by the dictionary builder
     */
    public StopwordsHandler getStopwordsHandler() {
        return this.m_dictionaryBuilder.getStopwordsHandler();
    }

    /**
     * Returns the tip text for the stopwordsHandler property.
     *
     * @return the tip text
     */
    public String stopwordsHandlerTipText() {
        return "The stopwords handler to use (Null means no stopwords are used).";
    }

    /**
     * Sets the tokenizer algorithm to use.
     *
     * @param value the tokenizer; null installs a {@link WordTokenizer}, the same
     *        default that {@link #setOptions(String[])} uses, so that
     *        {@link #getOptions()} never dereferences a null tokenizer
     */
    public void setTokenizer(Tokenizer value) {
        this.m_dictionaryBuilder.setTokenizer(value != null ? value : new WordTokenizer());
    }

    /**
     * Gets the tokenizer algorithm in use.
     *
     * @return the tokenizer held by the dictionary builder
     */
    public Tokenizer getTokenizer() {
        return this.m_dictionaryBuilder.getTokenizer();
    }

    /**
     * Returns the tip text for the tokenizer property.
     *
     * @return the tip text
     */
    public String tokenizerTipText() {
        return "The tokenizing algorithm to use on the strings.";
    }

    /**
     * Returns the revision string.
     *
     * @return the revision
     */
    @Override
    public String getRevision() {
        return RevisionUtils.extract("$Revision$");
    }

    /**
     * Main method for running this filter from the command line.
     *
     * @param argv the command-line arguments, passed on to runFilter
     */
    public static void main(String[] argv) {
        StringToWordVector.runFilter(new StringToWordVector(), argv);
    }
}

