/*
 * Decompiled with CFR 0.152.
 */
package com.rapidminer.operator;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.ListDataRowReader;
import com.rapidminer.example.table.MemoryExampleTable;
import com.rapidminer.gui.properties.PropertyTable;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.crawler.CrawlerPolicyProperties;
import com.rapidminer.operator.crawler.ParameterTypeCrawlerPolicy;
import com.rapidminer.operator.crawler.RapidMinerBasedCrawler;
import com.rapidminer.operator.crawler.StringMatchingLiteral;
import com.rapidminer.operator.crawler.StringMatchingRuleSet;
import com.rapidminer.operator.visualization.dependencies.NumericalMatrix;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeDirectory;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.parameter.ParameterTypeList;
import com.rapidminer.parameter.ParameterTypeString;
import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.StringTokenizer;
import javax.swing.JOptionPane;
import websphinx.DownloadParameters;
import websphinx.Link;

/*
 * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
 */
/**
 * Operator that runs a {@link RapidMinerBasedCrawler} starting at a configured URL.
 *
 * <p>The operator takes no input. On success it delivers an {@link ExampleSet} built from the
 * data rows collected by the crawler and the crawler's link matrix as a {@link NumericalMatrix}.
 * All settings (start URL, crawling rules, depth, delay, thread count, output directory, file
 * extension, page size limit, user agent, robot exclusion) are read from the operator parameters
 * declared in {@link #getParameterTypes()}.
 */
public class CrawlerOperator
extends Operator {
    public static final String PARAMETER_URL = "url";
    public static final String PARAMETER_CRAWLING_RULES = "crawling_rules";
    public static final String PARAMETER_MAX_DEPTH = "max_depth";
    public static final String PARAMETER_DELAY = "delay";
    public static final String PARAMETER_MAX_THREADS = "max_threads";
    public static final String PARAMETER_OUTPUT_DIR = "output_dir";
    public static final String PARAMETER_EXTENSION = "extension";
    public static final String PARAMETER_MAX_PAGE_SIZE = "max_page_size";
    public static final String PARAMETER_USER_AGENT = "user_agent";
    public static final String PARAMETER_OBEY_ROBOT_EXCLUSION = "obey_robot_exclusion";

    static {
        // Registers the GUI cell editor used for the key column of the crawling rules list.
        PropertyTable.registerPropertyKeyCellEditor(ParameterTypeCrawlerPolicy.class, CrawlerPolicyProperties.class);
    }

    public CrawlerOperator(OperatorDescription description) {
        super(description);
    }

    /**
     * Configures the crawler from the operator parameters, runs it and returns its results.
     *
     * <p>If robot exclusion is disabled, the user must explicitly confirm a warning dialog with
     * "Yes". Any other outcome (answering "No" or closing the dialog) aborts the operation.
     *
     * @return the example set of crawled data and the link matrix, or an empty array if the
     *         user did not confirm crawling with robot exclusion disabled
     * @throws OperatorException if a parameter cannot be read; a {@link UserError} (code 212)
     *         is thrown when the start URL is malformed
     */
    @Override
    public IOObject[] apply() throws OperatorException {
        HashMap<String, StringMatchingRuleSet> rules = this.parseCrawlingRules();
        RapidMinerBasedCrawler crawler = new RapidMinerBasedCrawler(rules, this.getParameterAsFile(PARAMETER_OUTPUT_DIR, true), this.getParameterAsString(PARAMETER_EXTENSION), this.getParameterAsInt(PARAMETER_DELAY), this.getParameterAsString(PARAMETER_USER_AGENT), this);

        // DownloadParameters is reassigned after every change*() call, so the final
        // instance carries all settings before it is handed back to the crawler.
        DownloadParameters downloadParams = crawler.getDownloadParameters();
        downloadParams = downloadParams.changeMaxThreads(this.getParameterAsInt(PARAMETER_MAX_THREADS));
        boolean obeyRobotExclusion = this.getParameterAsBoolean(PARAMETER_OBEY_ROBOT_EXCLUSION);
        downloadParams = downloadParams.changeObeyRobotExclusion(obeyRobotExclusion);
        downloadParams = downloadParams.changeUserAgent(this.getParameterAsString(PARAMETER_USER_AGENT));
        downloadParams = downloadParams.changeMaxPageSize(this.getParameterAsInt(PARAMETER_MAX_PAGE_SIZE));
        crawler.setDownloadParameters(downloadParams);

        // Ignoring robots.txt requires an explicit "Yes". Previously only an explicit "No"
        // aborted, so closing the dialog was silently treated as consent.
        if (!obeyRobotExclusion && !CrawlerOperator.confirmDisabledRobotExclusion()) {
            this.logNote("Crawling operation aborded by the user");
            return new IOObject[0];
        }

        crawler.setMaxDepth(this.getParameterAsInt(PARAMETER_MAX_DEPTH));
        MemoryExampleTable et = new MemoryExampleTable(crawler.getCrawlerExtractedAttributes());
        String url = this.getParameterAsString(PARAMETER_URL);
        try {
            crawler.setRoot(new Link(url));
            crawler.run();
        }
        catch (MalformedURLException e) {
            throw new UserError((Operator)this, 212, url, e);
        }
        et.readExamples(new ListDataRowReader(crawler.getDataRows().iterator()));
        ExampleSet es = et.createExampleSet(new HashMap<Attribute, String>());
        NumericalMatrix linkMatrix = crawler.getLinkMatrix();
        return new IOObject[]{es, linkMatrix};
    }

    /**
     * Reads the crawling rules parameter and groups the rules by their key.
     *
     * <p>Each list entry is a (key, rule) pair. Every rule becomes one conjunction that is
     * added to the rule set stored under its key, so several entries with the same key
     * accumulate in the same rule set.
     */
    private HashMap<String, StringMatchingRuleSet> parseCrawlingRules() throws OperatorException {
        HashMap<String, StringMatchingRuleSet> rules = new HashMap<String, StringMatchingRuleSet>();
        List<String[]> ruleEntries = this.getParameterList(PARAMETER_CRAWLING_RULES);
        for (String[] entry : ruleEntries) {
            String key = entry[0];
            String rule = entry[1];
            StringMatchingRuleSet ruleSet = rules.get(key);
            if (ruleSet == null) {
                ruleSet = new StringMatchingRuleSet();
                rules.put(key, ruleSet);
            }
            ruleSet.addConjunction(CrawlerOperator.parseConjunction(rule));
        }
        return rules;
    }

    /**
     * Splits a rule into its space-separated literals; a leading '-' marks a negated literal.
     */
    private static LinkedList<StringMatchingLiteral> parseConjunction(String rule) {
        LinkedList<StringMatchingLiteral> conjunction = new LinkedList<StringMatchingLiteral>();
        StringTokenizer tokens = new StringTokenizer(rule, " ");
        while (tokens.hasMoreTokens()) {
            // StringTokenizer never yields empty tokens, so charAt(0) is always valid.
            String literal = tokens.nextToken();
            boolean negated = literal.charAt(0) == '-';
            if (negated) {
                literal = literal.substring(1);
            }
            conjunction.add(new StringMatchingLiteral(literal, negated));
        }
        return conjunction;
    }

    /**
     * Asks the user to confirm crawling with robot exclusion disabled.
     *
     * @return {@code true} only if the user answered "Yes"; "No" and closing the dialog both
     *         yield {@code false}
     */
    private static boolean confirmDisabledRobotExclusion() {
        // NOTE(review): showConfirmDialog needs a display and throws HeadlessException in a
        // headless JVM - confirm whether this operator is expected to run in batch mode.
        int answer = JOptionPane.showConfirmDialog(null, "You disabled the support for the robots.txt. Do this only if you know what you are doing and if you are sure not to violate any laws or terms of use. Do you wish to proceed?", "Warning: Disabled robot exclusion", JOptionPane.YES_NO_OPTION, JOptionPane.WARNING_MESSAGE);
        return answer == JOptionPane.YES_OPTION;
    }

    /** This operator consumes no input objects. */
    @Override
    public Class<?>[] getInputClasses() {
        return new Class[0];
    }

    /** Delivers the crawled data as an example set and the link matrix. */
    @Override
    public Class<?>[] getOutputClasses() {
        return new Class[]{ExampleSet.class, NumericalMatrix.class};
    }

    /**
     * Declares the operator parameters in addition to those of the superclass.
     *
     * <p>Thread count, user agent and robot exclusion are marked as expert parameters; all
     * others are non-expert.
     */
    @Override
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();
        ParameterType type = new ParameterTypeString(PARAMETER_URL, "Specifies the url at which the crawler should start", false);
        type.setExpert(false);
        types.add(type);
        type = new ParameterTypeList(PARAMETER_CRAWLING_RULES, "Specifies a set of rules that determine, which links to follow and which pages to process (see tutorial for details)", new ParameterTypeCrawlerPolicy("property", "the value of the property"));
        type.setExpert(false);
        types.add(type);
        type = new ParameterTypeInt(PARAMETER_MAX_DEPTH, "Specifies the maximal depth of the crawling process", 0, Integer.MAX_VALUE, 2);
        type.setExpert(false);
        types.add(type);
        type = new ParameterTypeInt(PARAMETER_DELAY, "Specifies the delay when vistiting a page in milleseconds", 0, Integer.MAX_VALUE, 1000);
        type.setExpert(false);
        types.add(type);
        type = new ParameterTypeInt(PARAMETER_MAX_THREADS, "Specifies the number of crawling threads working in parallel", 1, Integer.MAX_VALUE, 1);
        type.setExpert(true);
        types.add(type);
        type = new ParameterTypeDirectory(PARAMETER_OUTPUT_DIR, "Specifies the directory to which to write the files", false);
        type.setExpert(false);
        types.add(type);
        type = new ParameterTypeString(PARAMETER_EXTENSION, "Specifies the extension of the stored files", "txt");
        type.setExpert(false);
        types.add(type);
        type = new ParameterTypeInt(PARAMETER_MAX_PAGE_SIZE, "Specifies the maximum page size (in KB): pages larger than this limit are not downloaded", 1, Integer.MAX_VALUE, 100);
        type.setExpert(false);
        types.add(type);
        type = new ParameterTypeString(PARAMETER_USER_AGENT, "The identity the crawler uses while accessing a server", "rapid-miner-crawler");
        type.setExpert(true);
        types.add(type);
        type = new ParameterTypeBoolean(PARAMETER_OBEY_ROBOT_EXCLUSION, "Specifies whether the crawler obeys the rules, which pages on site might be visited by a robot. Disable only if you know what you are doing and if you a sure not to violate any existing laws by doing so", true);
        type.setExpert(true);
        types.add(type);
        return types;
    }
}

