package com.rapidminer.operator;

import com.rapidminer.operator.extraction.ExtractionException;
import com.rapidminer.operator.extraction.TextExtractionWrapper;
import com.rapidminer.operator.extraction.segmenter.DocumentSegmenter;
import com.rapidminer.operator.extraction.segmenter.DocumentSegmenterClass;
import com.rapidminer.operator.extraction.util.FeatureExtractionUtil;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeDirectory;
import com.rapidminer.parameter.ParameterTypePreview;
import com.rapidminer.parameter.ParameterTypeString;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;

/* loaded from: input_file:WEB-INF/lib/rapidminer-plugintext-1.0.0.jar:com/rapidminer/operator/DocumentSegmenterOperator.class */
/**
 * Operator that splits each regular file of an input directory into segments and writes every
 * segment to its own file in an output directory.
 *
 * <p>The segmenter is obtained from this operator's parameters via
 * {@link DocumentSegmenterClass#getSegmenterFromParameters}. Output files are named
 * {@code seg<N>.<ext>}, where {@code N} is a counter shared across all input files and
 * {@code ext} is the extension of the input file the segment came from ({@code txt} when the
 * input file name contains no dot).
 *
 * <p>The operator neither consumes nor delivers any {@link IOObject}s; its only effect is the
 * set of files it writes.
 */
public class DocumentSegmenterOperator extends Operator {
    /** Key of the optional parameter naming the content type of the input texts. */
    public static final String PARAMETER_CONTENT_TYPE = "content_type";
    /** Key of the parameter naming the directory the segments are written to. */
    public static final String PARAMETER_OUTPUT = "output";
    /** Key of the parameter holding the expression that selects the segments. */
    public static final String PARAMETER_EXPRESSION = "expression";
    /** Key of the boolean parameter controlling whether CDATA is ignored when parsing HTML. */
    public static final String PARAMETER_IGNORE_CDATA = "ignore_cdata";

    /** Extension used for segment files when the input file name contains no dot. */
    private static final String DEFAULT_EXTENSION = "txt";

    public DocumentSegmenterOperator(OperatorDescription operatorDescription) {
        super(operatorDescription);
    }

    /**
     * Segments every regular file found directly in the input directory and writes each segment
     * to a separate file in the output directory. Sub-directories are skipped.
     *
     * @return always an empty array
     * @throws OperatorException if the input directory cannot be listed, if a segment file cannot
     *         be written (user error 303), or if the extraction reports an error (the
     *         {@link UserError} carried by the {@link ExtractionException} is rethrown with this
     *         operator attached)
     */
    @Override // com.rapidminer.operator.Operator
    public IOObject[] apply() throws OperatorException {
        DocumentSegmenter segmenter = DocumentSegmenterClass.getSegmenterFromParameters(getParameters());
        // NOTE(review): the boolean flag presumably asks for missing directories to be created - confirm.
        File outputDir = getParameterAsFile(PARAMETER_OUTPUT, true);
        File inputDir = getParameterAsFile(TextInputOperator.PARAMETER_TEXTS);

        // File.listFiles() returns null (instead of throwing) when the path is not a directory
        // or an I/O error occurs; fail with a clear message rather than a NullPointerException.
        File[] inputFiles = inputDir == null ? null : inputDir.listFiles();
        if (inputFiles == null) {
            throw new OperatorException("Cannot list the documents in '" + inputDir
                    + "': it is not a directory or it cannot be read");
        }

        // Shared across all input files so that segment file names never collide.
        int segmentCount = 0;
        for (File inputFile : inputFiles) {
            if (!inputFile.isFile()) {
                continue;
            }
            try {
                int contentType;
                if (isParameterSet(PARAMETER_CONTENT_TYPE)) {
                    contentType = TextExtractionWrapper.determineType(getParameterAsString(PARAMETER_CONTENT_TYPE));
                } else {
                    contentType = TextExtractionWrapper.determineType(inputFile);
                }
                String extension = extensionOf(inputFile.getName());
                Iterator<String> segments = segmenter.getSegments(inputFile, contentType);
                while (segments.hasNext()) {
                    // Fetch the segment before opening the file so that a failing iterator
                    // does not leave an empty output file behind.
                    String segment = segments.next();
                    String targetPath = outputDir.getAbsolutePath() + File.separator + "seg" + segmentCount + "." + extension;
                    try {
                        writeSegment(targetPath, segment);
                    } catch (IOException e) {
                        throw new UserError(this, 303, targetPath, e);
                    }
                    segmentCount++;
                }
            } catch (ExtractionException e) {
                UserError userError = e.getUserError();
                userError.setOperator(this);
                throw userError;
            }
        }
        return new IOObject[0];
    }

    /**
     * Returns the part of {@code fileName} after its last dot, or {@code txt} if the name
     * contains no dot. A name ending in a dot yields the empty string.
     */
    private static String extensionOf(String fileName) {
        int dot = fileName.lastIndexOf('.');
        return dot > -1 ? fileName.substring(dot + 1) : DEFAULT_EXTENSION;
    }

    /**
     * Writes {@code content} to the file at {@code path}, replacing any existing file, and
     * always closes the writer, even when writing fails. The text is encoded with the default
     * charset used by {@link FileWriter}.
     *
     * @throws IOException if the file cannot be opened, written or closed
     */
    private static void writeSegment(String path, String content) throws IOException {
        BufferedWriter writer = new BufferedWriter(new FileWriter(path));
        try {
            writer.write(content);
        } finally {
            writer.close();
        }
    }

    /** This operator consumes no input objects. */
    @Override // com.rapidminer.operator.Operator
    public Class<?>[] getInputClasses() {
        return new Class[0];
    }

    /** This operator delivers no output objects. */
    @Override // com.rapidminer.operator.Operator
    public Class<?>[] getOutputClasses() {
        return new Class[0];
    }

    /**
     * Extends the inherited parameter list with a preview parameter, the input and output
     * directories, the optional content type, the segment expression, the CDATA switch and the
     * namespace parameter created by {@link FeatureExtractionUtil#createNamespaceParameter()}.
     */
    @Override // com.rapidminer.operator.Operator, com.rapidminer.parameter.ParameterHandler
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> parameterTypes = super.getParameterTypes();
        ParameterTypePreview preview = new ParameterTypePreview(SegmenterPreviewerCreator.class, this);
        preview.setExpert(false);
        parameterTypes.add(preview);
        parameterTypes.add(new ParameterTypeDirectory(TextInputOperator.PARAMETER_TEXTS, "A directory containing the documents to be segmented", false));
        parameterTypes.add(new ParameterTypeString(PARAMETER_CONTENT_TYPE, "The content type of the input texts (txt, xml, html)", true));
        parameterTypes.add(new ParameterTypeDirectory(PARAMETER_OUTPUT, "The directory to which to write the segments", false));
        parameterTypes.add(new ParameterTypeString(PARAMETER_EXPRESSION, "Specifies a regular expression or XPath expression that matches against substrings of the content which should be treated as individual segments. The syntax is the same as for attribute extraction (see WVTool operator), but instead of extracting only the first match, all matches are extracted and written to individual files", false));
        parameterTypes.add(new ParameterTypeBoolean(PARAMETER_IGNORE_CDATA, "Specifies whether CDATA should be ignored when parsing HTML", true));
        parameterTypes.add(FeatureExtractionUtil.createNamespaceParameter());
        return parameterTypes;
    }
}
