/*
 * Decompiled with CFR 0.152.
 */
package com.rapidminer.operator;

import com.rapidminer.ObjectVisualizer;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.AttributeWeights;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.operator.ClassNameMapper;
import com.rapidminer.operator.ExampleTableOutputFilter;
import com.rapidminer.operator.IOContainer;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.MissingIOObjectException;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorChain;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.RapidMinerOutputFilter;
import com.rapidminer.operator.TextVisualizer;
import com.rapidminer.operator.TokenSequence;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.WVToolRapidMinerLogger;
import com.rapidminer.operator.WordList;
import com.rapidminer.operator.condition.InnerOperatorCondition;
import com.rapidminer.operator.condition.LastInnerOperatorCondition;
import com.rapidminer.operator.extraction.ExtractingInputFilter;
import com.rapidminer.operator.extraction.ExtractionException;
import com.rapidminer.operator.extraction.TextExtractor;
import com.rapidminer.operator.extraction.util.FeatureExtractionUtil;
import com.rapidminer.parameter.ParameterHandler;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeFile;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.parameter.ParameterTypeString;
import com.rapidminer.parameter.ParameterTypeStringCategory;
import com.rapidminer.parameter.UndefinedParameterError;
import com.rapidminer.parameter.conditions.BooleanParameterCondition;
import com.rapidminer.parameter.conditions.ParameterCondition;
import com.rapidminer.tools.LoggingHandler;
import com.rapidminer.tools.ObjectVisualizerService;
import edu.udo.cs.wvtool.config.WVTConfiguration;
import edu.udo.cs.wvtool.config.WVTConfigurationFact;
import edu.udo.cs.wvtool.config.WVTConfigurationRule;
import edu.udo.cs.wvtool.generic.vectorcreation.BinaryOccurrences;
import edu.udo.cs.wvtool.generic.vectorcreation.TFIDF;
import edu.udo.cs.wvtool.generic.vectorcreation.TermFrequency;
import edu.udo.cs.wvtool.generic.vectorcreation.TermOccurrences;
import edu.udo.cs.wvtool.generic.vectorcreation.WVTVectorCreator;
import edu.udo.cs.wvtool.main.WVTDocumentInfo;
import edu.udo.cs.wvtool.main.WVTInputList;
import edu.udo.cs.wvtool.main.WVTTokenSequence;
import edu.udo.cs.wvtool.main.WVTool;
import edu.udo.cs.wvtool.util.WVToolException;
import edu.udo.cs.wvtool.util.WVToolLogger;
import edu.udo.cs.wvtool.wordlist.WVTWordList;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Reader;
import java.io.Writer;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

/*
 * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
 */
public abstract class TextInput
extends OperatorChain {
    public static final String PARAMETER_DEFAULT_CONTENT_TYPE = "default_content_type";
    public static final String PARAMETER_DEFAULT_CONTENT_ENCODING = "default_content_encoding";
    public static final String PARAMETER_DEFAULT_CONTENT_LANGUAGE = "default_content_language";
    public static final String PARAMETER_PRUNE_BELOW = "prune_below";
    public static final String PARAMETER_PRUNE_ABOVE = "prune_above";
    public static final String PARAMETER_VECTOR_CREATION = "vector_creation";
    public static final String PARAMETER_USE_CONTENT_ATTRIBUTES = "use_content_attributes";
    public static final String PARAMETER_USE_GIVEN_WORD_LIST = "use_given_word_list";
    public static final String PARAMETER_INPUT_WORD_LIST = "input_word_list";
    public static final String PARAMETER_RETURN_WORD_LIST = "return_word_list";
    public static final String PARAMETER_OUTPUT_WORD_LIST = "output_word_list";
    public static final String PARAMETER_ID_ATTRIBUTE_TYPE = "id_attribute_type";
    public static final String PARAMETER_TEXT_QUERY = "text_query";
    public static final String PARAMETER_CREATE_TEXT_VISUALIZER = "create_text_visualizer";
    public static final String PARAMETER_ON_THE_FLY_PRUNING = "on_the_fly_pruning";
    public static Class<?>[] vectorCreation = new Class[]{TermFrequency.class, TermOccurrences.class, TFIDF.class, BinaryOccurrences.class};
    private ClassNameMapper vectorCreationMapper;
    protected ExampleSet inputExampleSet = null;
    private WVTool wvt = new WVTool(false);
    private WVTConfiguration config;

    public TextInput(OperatorDescription description) {
        super(description);
    }

    protected abstract WVTInputList createInputList() throws OperatorException;

    protected abstract Attribute getLabel() throws OperatorException;

    protected abstract WVTConfiguration createConfiguration() throws OperatorException;

    protected abstract RapidMinerOutputFilter getOutputFilter(WVTWordList var1, Attribute var2) throws OperatorException;

    private WVTWordList createWordList(WVTInputList inputList, List<?> words, boolean append, int pruneFrequency) throws WVToolException, OperatorException {
        WVTWordList wordList = null;
        wordList = words != null ? new WVTWordList(words, inputList.getNumClasses()) : new WVTWordList(inputList.getNumClasses());
        int counter = 1;
        Iterator it = inputList.getEntries();
        while (it.hasNext()) {
            WVTDocumentInfo docInfo = (WVTDocumentInfo)it.next();
            WVTTokenSequence tokens = this.getTokenSequence(docInfo);
            this.wvt.addToWordList(wordList, tokens, append);
            if (pruneFrequency > 0 && counter % pruneFrequency == 0) {
                wordList.pruneByFrequency(2, Integer.MAX_VALUE);
            }
            ++counter;
        }
        if (counter <= 1) {
            throw new UserError((Operator)this, 932);
        }
        return wordList;
    }

    private WVTTokenSequence getTokenSequence(WVTDocumentInfo docInfo) throws OperatorException {
        TokenSequence result = null;
        try {
            BufferedReader in = new BufferedReader(this.wvt.getReader(docInfo, this.config));
            StringBuffer buffer = new StringBuffer();
            String line = null;
            while ((line = in.readLine()) != null) {
                buffer.append(line);
                buffer.append(' ');
            }
            in.close();
            TokenSequence sequence = new TokenSequence(buffer.toString(), docInfo);
            IOContainer container = new IOContainer(new IOObject[]{sequence});
            int i = 0;
            while (i < this.getNumberOfOperators()) {
                container = this.getOperator(i).apply(container);
                ++i;
            }
            result = (TokenSequence)container.get(TokenSequence.class);
        }
        catch (MissingIOObjectException e) {
            throw new UserError((Operator)this, (Throwable)e, 127, new Object[]{e});
        }
        catch (IOException e) {
            throw new UserError((Operator)this, (Throwable)e, 302, new Object[]{docInfo.getSourceName(), e});
        }
        catch (WVToolException e) {
            throw new UserError((Operator)this, (Throwable)e, 905, new Object[]{"wvtool", e});
        }
        return result;
    }

    private void pruneWordList(WVTWordList wordList) throws UndefinedParameterError {
        double rankFraction;
        String pruneBelowStr = this.getParameter(PARAMETER_PRUNE_BELOW);
        String pruneAboveStr = this.getParameter(PARAMETER_PRUNE_ABOVE);
        int pruneBelow = -1;
        int pruneAbove = -1;
        try {
            if (pruneBelowStr.charAt(pruneBelowStr.length() - 1) == '%') {
                pruneBelowStr = pruneBelowStr.substring(0, pruneBelowStr.length() - 1);
                rankFraction = 1.0 - Double.parseDouble(pruneBelowStr) / 100.0;
                pruneBelow = wordList.getFrequencyByRank((int)Math.max((double)wordList.getNumWords() * rankFraction, 1.0));
            } else {
                pruneBelow = Integer.parseInt(pruneBelowStr);
            }
        }
        catch (NumberFormatException e) {
            this.logError("Could not parse the parameter prune_below: " + e.getMessage());
        }
        try {
            if (pruneAboveStr.charAt(pruneAboveStr.length() - 1) == '%') {
                pruneAboveStr = pruneAboveStr.substring(0, pruneAboveStr.length() - 1);
                rankFraction = Double.parseDouble(pruneAboveStr) / 100.0;
                pruneAbove = wordList.getFrequencyByRank((int)Math.max((double)wordList.getNumWords() * rankFraction, 1.0));
            } else {
                pruneAbove = Integer.parseInt(pruneAboveStr);
            }
        }
        catch (NumberFormatException e) {
            this.logError("Could not parse the parameter prune_above: " + e.getMessage());
        }
        if (pruneBelow >= 0 || pruneAbove >= 0) {
            this.log("Pruning word list.");
            wordList.pruneByFrequency(pruneBelow < 0 ? 0 : pruneBelow, pruneAbove < 0 ? Integer.MAX_VALUE : pruneAbove);
        }
    }

    public IOObject[] apply() throws OperatorException {
        String textExtractorStr;
        this.config = this.createConfiguration();
        WVToolLogger.setGlobalLogger((WVToolLogger)new WVToolRapidMinerLogger((LoggingHandler)this));
        if (this.getNumberOfOperators() == 0) {
            this.logWarning("There are no suboperators for this operator. This is usually not intended. You would probably like to at least add a tokenizer");
        }
        if (this.getInput().contains(ExampleSet.class)) {
            this.inputExampleSet = (ExampleSet)this.getInput(ExampleSet.class);
        }
        if ((textExtractorStr = this.getParameterAsString(PARAMETER_TEXT_QUERY)) != null) {
            TextExtractor textExtractor = null;
            try {
                textExtractor = FeatureExtractionUtil.getExtractor(textExtractorStr, FeatureExtractionUtil.getNamespaceMapping(this.getParameters()));
            }
            catch (ExtractionException e) {
                UserError error = e.getUserError();
                error.setOperator((Operator)this);
                throw error;
            }
            ExtractingInputFilter extrInFilter = new ExtractingInputFilter(this.config, textExtractor);
            this.config.setConfigurationRule("inputfilter", (WVTConfigurationRule)new WVTConfigurationFact((Object)extrInFilter));
        }
        WVTVectorCreator vectorCreator = (WVTVectorCreator)this.vectorCreationMapper.getInstantiation(this.getParameterAsString(PARAMETER_VECTOR_CREATION));
        WVTInputList list = this.createInputList();
        try {
            boolean userWordListMode = true;
            AttributeWeights weights = null;
            try {
                weights = (AttributeWeights)this.getInput(AttributeWeights.class);
            }
            catch (Exception e) {
                userWordListMode = false;
                weights = null;
            }
            WVTWordList wordList = null;
            if (this.getParameterAsBoolean(PARAMETER_USE_GIVEN_WORD_LIST)) {
                wordList = ((WordList)this.getInput(WordList.class)).getWordList();
            } else if (this.isParameterSet(PARAMETER_INPUT_WORD_LIST)) {
                if (userWordListMode) {
                    this.logWarning("Input attribute weights are ignored for word list loaded from file.");
                }
                File wordListFile = this.getParameterAsFile(PARAMETER_INPUT_WORD_LIST);
                try {
                    BufferedReader wordListIn = new BufferedReader(new FileReader(wordListFile));
                    wordList = new WVTWordList((Reader)wordListIn);
                }
                catch (IOException e) {
                    throw new UserError((Operator)this, 302, new Object[]{wordListFile, e.getMessage()});
                }
            } else if (userWordListMode) {
                LinkedList<String> userWordList = new LinkedList<String>();
                LinkedList attributeNames = new LinkedList(weights.getAttributeNames());
                int j = 0;
                while (j < attributeNames.size()) {
                    String attributeName = (String)attributeNames.get(j);
                    double attributeWeight = weights.getWeight(attributeName);
                    if (attributeWeight > 0.0) {
                        userWordList.add(attributeName);
                    }
                    ++j;
                }
                wordList = this.createWordList(list, userWordList, false, this.getParameterAsInt(PARAMETER_ON_THE_FLY_PRUNING));
            } else {
                wordList = this.createWordList(list, null, true, this.getParameterAsInt(PARAMETER_ON_THE_FLY_PRUNING));
            }
            if (!this.isParameterSet(PARAMETER_INPUT_WORD_LIST) && !this.getParameterAsBoolean(PARAMETER_USE_GIVEN_WORD_LIST)) {
                this.pruneWordList(wordList);
            } else {
                this.log("Using external wordlist, no pruning is performed");
            }
            RapidMinerOutputFilter out = this.getOutputFilter(wordList, this.getLabel());
            Iterator it = list.getEntries();
            while (it.hasNext()) {
                WVTDocumentInfo docInfo = (WVTDocumentInfo)it.next();
                WVTTokenSequence tokens = this.getTokenSequence(docInfo);
                out.write(this.wvt.createVector(tokens, vectorCreator, wordList));
            }
            if (this.getParameterAsBoolean(PARAMETER_CREATE_TEXT_VISUALIZER)) {
                TextVisualizer textVis = new TextVisualizer(list, this.config, this.getParameterAsInt(PARAMETER_ID_ATTRIBUTE_TYPE));
                ObjectVisualizerService.addObjectVisualizer((ObjectVisualizer)textVis);
            }
            if (this.isParameterSet(PARAMETER_OUTPUT_WORD_LIST)) {
                File wordListFile = this.getParameterAsFile(PARAMETER_OUTPUT_WORD_LIST, true);
                try {
                    FileWriter wordListOut = new FileWriter(wordListFile);
                    wordList.store((Writer)wordListOut);
                }
                catch (IOException e) {
                    throw new UserError((Operator)this, 303, new Object[]{wordListFile, e.getMessage()});
                }
            }
            ExampleSet exampleSet = out.createExampleSet();
            out.cleanUp();
            if (this.getParameterAsBoolean(PARAMETER_RETURN_WORD_LIST)) {
                return new IOObject[]{exampleSet, new WordList(wordList)};
            }
            return new IOObject[]{exampleSet};
        }
        catch (WVToolException e) {
            throw new UserError((Operator)this, (Throwable)e, 905, new Object[]{"wvtool", e});
        }
    }

    public Class<?>[] getInputClasses() {
        return new Class[0];
    }

    public Class<?>[] getOutputClasses() {
        return new Class[]{ExampleSet.class, WordList.class};
    }

    public InnerOperatorCondition getInnerOperatorCondition() {
        return new LastInnerOperatorCondition(new Class[]{TokenSequence.class}, new Class[]{TokenSequence.class}, true);
    }

    public int getMaxNumberOfInnerOperators() {
        return Integer.MAX_VALUE;
    }

    public int getMinNumberOfInnerOperators() {
        return 0;
    }

    public List<ParameterType> getParameterTypes() {
        List types = super.getParameterTypes();
        types.add(new ParameterTypeString(PARAMETER_DEFAULT_CONTENT_TYPE, "The default content type if not specified by the example set  (possible values: pdf, html, htm, xml, text, txt).", ""));
        types.add(new ParameterTypeString(PARAMETER_DEFAULT_CONTENT_ENCODING, "The default content encoding if not specified by the example set (only encodings supported by Java can be used).", ""));
        types.add(new ParameterTypeString(PARAMETER_DEFAULT_CONTENT_LANGUAGE, "The default content language if not specified by the example set.", ""));
        types.add(new ParameterTypeString(PARAMETER_PRUNE_BELOW, "Prune words that appear inat most that many documents. -1 for no pruning. Alternatively you can provide a percentage value, denoting the lowest document frequency in p words with the highest frequency.", "-1"));
        types.add(new ParameterTypeString(PARAMETER_PRUNE_ABOVE, "Prune words that appear in at least that many documents. -1 for no pruning. Alternatively you can provide a percentage value, denoting the highest document frequency in p words with the lowest frequency.", "-1"));
        String[] vectorizationClassNames = new String[vectorCreation.length];
        int i = 0;
        while (i < vectorCreation.length) {
            vectorizationClassNames[i] = vectorCreation[i].getCanonicalName();
            ++i;
        }
        this.vectorCreationMapper = new ClassNameMapper(vectorizationClassNames);
        ParameterTypeStringCategory type = new ParameterTypeStringCategory(PARAMETER_VECTOR_CREATION, "Method used to create word vectors", this.vectorCreationMapper.getShortClassNames(), "TFIDF");
        type.setExpert(false);
        types.add(type);
        types.add(new ParameterTypeBoolean(PARAMETER_USE_CONTENT_ATTRIBUTES, "If set to true, the returned example set will contain content type, encoding, and language attributes.", false));
        types.add(new ParameterTypeBoolean(PARAMETER_USE_GIVEN_WORD_LIST, "If set, the given word of list in the input will be used", false));
        type = new ParameterTypeFile(PARAMETER_INPUT_WORD_LIST, "Load a word list from this file instead of creating it from the input data.", null, true);
        type.registerDependencyCondition((ParameterCondition)new BooleanParameterCondition((ParameterHandler)this, PARAMETER_USE_GIVEN_WORD_LIST, false, false));
        types.add(type);
        types.add(new ParameterTypeBoolean(PARAMETER_RETURN_WORD_LIST, "If checked the word list will be returned as part of the result.", false));
        types.add(new ParameterTypeFile(PARAMETER_OUTPUT_WORD_LIST, "Save the used word list into this file.", null, true));
        types.add(new ParameterTypeCategory(PARAMETER_ID_ATTRIBUTE_TYPE, "Indicates if long ids (complete paths), short ids (last part of the source name), or numerical ids will be used.", ExampleTableOutputFilter.ID_TYPE_NAMES, 2));
        ParameterType param = FeatureExtractionUtil.createNamespaceParameter();
        types.add(param);
        ParameterTypeString extractorParameter = new ParameterTypeString(PARAMETER_TEXT_QUERY, "Query that extracts the parts of a document, that should be used for vectorization. This query can be XPath or a regular expression. If a regular expression is used, the query must have the following form: '<regex-expression> <replacement-pattern>', where the <replacement_pattern> states how a match is replaced to generate the final information. '$1' would yield the first matching group as result. For both, XPath and regular expression, all matches are concatanated and then passed to the vectorization process.");
        extractorParameter.setExpert(true);
        types.add(extractorParameter);
        types.add(new ParameterTypeBoolean(PARAMETER_CREATE_TEXT_VISUALIZER, "Indicates if a text specific object visualizer should be created which can be used in plotters etc. Note: Text visualization does not work for id type number.", false));
        types.add(new ParameterTypeInt(PARAMETER_ON_THE_FLY_PRUNING, "Denotes after how many documents, singular terms should be removed from the word list. 0 indicates no pruning.", 0, Integer.MAX_VALUE, -1));
        return types;
    }
}

