package gr.uoa.di.resourcediscovery.methods;

import gr.uoa.di.resourcediscovery.MalformedConfigurationException;
import gr.uoa.di.resourcediscovery.MethodProvider;
import gr.uoa.di.resourcediscovery.Toolkit;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.log4j.Logger;
import org.archive.modules.net.RobotsDirectives;
import org.archive.modules.net.Robotstxt;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.traversal.DocumentTraversal;
import org.w3c.dom.traversal.NodeFilter;
import org.w3c.dom.traversal.NodeIterator;
import org.xml.sax.SAXException;

/* loaded from: input_file:gr/uoa/di/resourcediscovery/methods/XPathAndCrawl.class */
/**
 * Resource discovery method that locates resource links on a landing page.
 *
 * <p>Starting from a URL, {@link #getResources(URL, MethodProvider)} resolves redirections,
 * optionally follows frames and anchors one level down, and then looks on every candidate
 * page for anchors whose target MIME type is one of {@link #getMimeTypes()}. The position of
 * each such anchor is remembered as a simple {@code /NAME[index]/...} path (the "xpaths") so
 * that later pages can be resolved without probing every link again.
 *
 * <p>The logger and the parsed robots.txt state are transient; they are rebuilt by
 * {@code readResolve()} when an instance is restored.
 */
public class XPathAndCrawl implements ResourceDiscoveryMethod {
    transient Logger logger = Logger.getLogger(XPathAndCrawl.class);
    /** Whether FRAME/IFRAME sources of the landing page are added to the candidate pages. */
    private boolean resolveFrames = true;
    /** Whether the landing page is replaced by the HTML pages it links to. */
    private boolean skipFirstPage = false;
    /** Value handed to the {@code Toolkit} network helpers on every call. */
    private long sleepMillis = 100L;
    /** When true, robots.txt directives are not consulted. */
    private boolean ignoreRobotsTxt = false;
    /** Agent name used to select the robots.txt directives. */
    private String agentName = "OpenAIRE_Harvester";
    /** MIME types that identify a link as a resource. */
    private List<String> mimeTypes = new ArrayList<String>();
    /** Whether to crawl the pages again when the known xpaths yield nothing. */
    private boolean fallback = true;
    /** Location of the robots.txt file, or null when none was configured. */
    private String robotstxtUrl = null;
    private transient Robotstxt robot = null;
    private transient RobotsDirectives directives = null;
    /** Paths of the anchors that pointed to a resource on previously crawled pages. */
    private List<String> xpaths = new ArrayList<String>();

    /** Creates an instance with default settings that ignores robots.txt. */
    public XPathAndCrawl() {
        this.ignoreRobotsTxt = true;
    }

    /**
     * Creates an instance for the given MIME types and robots.txt location.
     *
     * @param list MIME types that identify a resource; {@code null} is treated as empty
     * @param str  URL of the robots.txt file; {@code null} disables robots.txt handling
     * @throws FileNotFoundException declared for compatibility; a missing robots.txt is
     *                               handled by disabling robots.txt handling
     * @throws IOException           if robots.txt cannot be read for any other reason
     */
    public XPathAndCrawl(List<String> list, String str) throws FileNotFoundException, IOException {
        if (list != null) {
            this.mimeTypes.addAll(list);
        }
        // Remember the location so that readResolve() can reload the directives later.
        this.robotstxtUrl = str;
        loadRobotsTxt(str);
    }

    /**
     * Sets the robots.txt location and loads its directives for the current agent name.
     *
     * @param str URL of the robots.txt file; {@code null} disables robots.txt handling
     * @throws FileNotFoundException declared for compatibility; a missing robots.txt is
     *                               handled by disabling robots.txt handling
     * @throws IOException           if robots.txt cannot be read for any other reason
     */
    public void setRobotstxt(String str) throws FileNotFoundException, IOException {
        this.robotstxtUrl = str;
        loadRobotsTxt(str);
    }

    public String getRobotstxtUrl() {
        return this.robotstxtUrl;
    }

    /**
     * Returns the resource URLs found for the given page.
     *
     * <p>If the (redirected) URL itself has one of the configured MIME types it is returned
     * as the only result. Otherwise every candidate page is examined using the known xpaths,
     * which are learned from the first page when none are known yet. When nothing is found and
     * fallback is enabled, all candidate pages are crawled again for resource anchors.
     * Failures on an individual page are logged and do not stop the remaining pages.
     *
     * @param url            page to inspect
     * @param methodProvider provider that is notified (via {@code setMethod}) after xpaths
     *                       have been learned
     * @return the resource URLs found; possibly empty, never {@code null}
     * @throws SAXException if the landing page or a page followed before the xpath phase
     *                      cannot be parsed
     * @throws IOException  if such a page cannot be read
     */
    @Override // gr.uoa.di.resourcediscovery.methods.ResourceDiscoveryMethod
    public List<String> getResources(URL url, MethodProvider methodProvider) throws SAXException, IOException {
        String requestedUrl = url.toString();
        this.logger.debug("Known xpaths: " + this.xpaths);
        String landingUrl = Toolkit.getRedirectedUrl(requestedUrl, this.sleepMillis);
        this.logger.debug("Resolved possible redirections. Url: " + landingUrl);

        List<String> resources = new ArrayList<String>();
        List<String> pages = new ArrayList<String>();
        pages.add(landingUrl);

        // The URL may already point straight at a resource.
        if (this.mimeTypes.contains(Toolkit.getMimeType(landingUrl, this.sleepMillis))) {
            resources.add(Toolkit.makeAbsolute(landingUrl, new URL(landingUrl)));
            return resources;
        }

        if (this.resolveFrames) {
            pages.addAll(resolveFrames(parse(landingUrl), new URL(landingUrl)));
            this.logger.debug("urls after resolving frames: " + pages);
        }

        if (this.skipFirstPage) {
            List<String> nextLevel = new ArrayList<String>();
            for (String page : pages) {
                nextLevel.addAll(oneDepthDown(parse(page), new URL(page)));
            }
            pages.remove(landingUrl);
            if (this.resolveFrames) {
                for (String framePage : pages) {
                    nextLevel.addAll(resolveFrames(parse(framePage), new URL(framePage)));
                }
            }
            pages.addAll(nextLevel);
            this.logger.debug("urls after skipping 1st page and resolving frames: " + pages);
        }

        for (String page : pages) {
            this.logger.debug("looking for resource in: " + page);
            harvestPage(page, landingUrl, methodProvider, false, resources);
        }

        if (resources.isEmpty() && this.fallback) {
            for (String page : pages) {
                this.logger.debug("looking for resource in (not found in xpath): " + page);
                harvestPage(page, landingUrl, methodProvider, true, resources);
            }
        }
        return resources;
    }

    /**
     * Collects the resources of a single candidate page into {@code resources}.
     *
     * <p>The page is crawled for resource anchors (and the provider is notified) when
     * {@code alwaysCrawl} is set or no xpaths are known yet; afterwards every known xpath is
     * evaluated against the page. Any failure is logged and the page is skipped.
     */
    private void harvestPage(String page, String landingUrl, MethodProvider methodProvider,
            boolean alwaysCrawl, List<String> resources) {
        try {
            URL pageUrl = new URL(Toolkit.makeAbsolute(page, new URL(landingUrl)));

            // Without loaded directives there is nothing that could disallow the page.
            boolean allowed = this.ignoreRobotsTxt || this.directives == null
                    || this.directives.allows(Toolkit.makeRelative(pageUrl));
            if (!allowed) {
                this.logger.debug("Skipping " + pageUrl + ". Disallowed by robots.txt directives.");
                return;
            }

            boolean crawl = alwaysCrawl || this.xpaths.isEmpty();
            if (crawl && !alwaysCrawl) {
                this.logger.debug("No xpath information, crawling");
            }
            Document document = parse(pageUrl.toString());

            if (crawl) {
                for (Node anchor : findNodesWithResource(document, pageUrl)) {
                    String xpath = getXpathToRoot(anchor);
                    // Pages sharing a layout produce the same path; store it only once.
                    if (!this.xpaths.contains(xpath)) {
                        this.xpaths.add(xpath);
                    }
                    this.logger.debug(xpath);
                }
                try {
                    URL landing = new URL(landingUrl);
                    methodProvider.setMethod(new URL(landing.getProtocol() + "://" + landing.getHost()), this);
                } catch (MalformedConfigurationException e) {
                    this.logger.error("Error updating xpath information", e);
                }
            }

            for (String xpath : this.xpaths) {
                String resourceUrl = getResourceUrl(xpath, document, pageUrl);
                if (resourceUrl != null) {
                    this.logger.debug(resourceUrl);
                    resources.add(resourceUrl);
                }
            }
        } catch (Exception e) {
            // Best effort: one unreachable or malformed page must not stop the others.
            this.logger.error("Error while looking for resources in " + page, e);
        }
    }

    /** Parses the document at the given location and returns its DOM. */
    private Document parse(String location) throws SAXException, IOException {
        DOMParser parser = new DOMParser();
        parser.parse(location);
        return parser.getDocument();
    }

    /** Returns the absolute {@code src} URLs of all FRAME and IFRAME elements of the document. */
    private List<String> resolveFrames(Document document, URL url) {
        List<String> frameUrls = new ArrayList<String>();
        for (Node frame : findElements(document, "FRAME", "IFRAME")) {
            String absolute = absoluteLink(frame, "src", url);
            if (absolute != null) {
                frameUrls.add(absolute);
            }
        }
        return frameUrls;
    }

    /** Returns the absolute targets of all anchors whose MIME type contains {@code text/html}. */
    private List<String> oneDepthDown(Document document, URL url) {
        List<String> htmlLinks = new ArrayList<String>();
        for (Node anchor : findElements(document, "A")) {
            String absolute = absoluteLink(anchor, "href", url);
            String mimeType = absolute == null ? null : probeMimeType(absolute);
            if (mimeType != null && mimeType.contains("text/html")) {
                htmlLinks.add(absolute);
            }
        }
        return htmlLinks;
    }

    /**
     * Builds the {@code /NAME[index]} path from the closest HTML ancestor down to the node.
     *
     * <p>The index of every step, including the HTML step, is the 1-based position among the
     * preceding siblings with the same node name, which is the rule
     * {@link #getResourceUrl(String, Document, URL)} applies when it evaluates the path.
     */
    private String getXpathToRoot(Node node) {
        String path = "";
        Node current = node;
        while (current != null) {
            String name = current.getNodeName();
            int position = 1;
            for (Node sibling = current.getPreviousSibling(); sibling != null;
                    sibling = sibling.getPreviousSibling()) {
                if (name.equals(sibling.getNodeName())) {
                    position++;
                }
            }
            path = "/" + name + "[" + position + "]" + path;
            if ("HTML".equals(name)) {
                break;
            }
            current = current.getParentNode();
        }
        return path;
    }

    /** Returns the anchors of the document whose target has one of the configured MIME types. */
    private List<Node> findNodesWithResource(Document document, URL url) {
        List<Node> resourceAnchors = new ArrayList<Node>();
        for (Node anchor : findElements(document, "A")) {
            String absolute = absoluteLink(anchor, "href", url);
            String mimeType = absolute == null ? null : probeMimeType(absolute);
            if (mimeType != null && this.mimeTypes.contains(mimeType)) {
                resourceAnchors.add(anchor);
            }
        }
        return resourceAnchors;
    }

    /**
     * Evaluates a path produced by {@link #getXpathToRoot(Node)} against the document.
     *
     * @return the absolute {@code href} of the node the path leads to, or {@code null} when
     *         the path does not match, the node has no {@code href}, or the path is malformed
     */
    private String getResourceUrl(String str, Document document, URL url) {
        try {
            Node levelStart = document.getFirstChild();
            Node matched = null;
            for (String step : str.split("/")) {
                if (step.trim().equals("")) {
                    continue;
                }
                int bracket = step.indexOf('[');
                String name = step.substring(0, bracket);
                int wanted = Integer.parseInt(step.substring(bracket).replace("[", "").replace("]", ""));

                matched = null;
                int seen = 0;
                for (Node candidate = levelStart; candidate != null; candidate = candidate.getNextSibling()) {
                    if (candidate.getNodeName().equals(name) && ++seen == wanted) {
                        matched = candidate;
                        break;
                    }
                }
                if (matched == null) {
                    return null;
                }
                levelStart = matched.getFirstChild();
            }
            // Read the href from the matched node itself so that childless anchors work too.
            String href = attributeValue(matched, "href");
            return href == null ? null : Toolkit.makeAbsolute(href, url);
        } catch (Exception e) {
            this.logger.debug("Could not evaluate xpath " + str + " on " + url, e);
            return null;
        }
    }

    /**
     * Returns, in document order, the elements whose node name equals one of the given names.
     * A traversal failure is logged and the elements gathered so far are returned.
     */
    private List<Node> findElements(Document document, String... tagNames) {
        List<Node> matches = new ArrayList<Node>();
        try {
            NodeIterator walker = ((DocumentTraversal) document)
                    .createNodeIterator(document, NodeFilter.SHOW_ELEMENT, (NodeFilter) null, true);
            for (Node node = walker.nextNode(); node != null; node = walker.nextNode()) {
                for (String tagName : tagNames) {
                    if (tagName.equals(node.getNodeName())) {
                        matches.add(node);
                        break;
                    }
                }
            }
        } catch (RuntimeException e) {
            this.logger.error("Error while traversing document", e);
        }
        return matches;
    }

    /** Returns the value of the named attribute, or {@code null} when the node lacks it. */
    private static String attributeValue(Node node, String attributeName) {
        if (node == null || node.getAttributes() == null) {
            return null;
        }
        Node attribute = node.getAttributes().getNamedItem(attributeName);
        return attribute == null ? null : attribute.getNodeValue();
    }

    /**
     * Resolves the named link attribute of the node against {@code base}.
     *
     * @return the absolute URL, or {@code null} when the attribute is missing or unresolvable
     */
    private String absoluteLink(Node node, String attributeName, URL base) {
        String link = attributeValue(node, attributeName);
        if (link == null) {
            return null;
        }
        try {
            return Toolkit.makeAbsolute(link, base);
        } catch (Exception e) {
            // A single broken link is skipped; the remaining links are still examined.
            this.logger.debug("Ignoring unresolvable link " + link, e);
            return null;
        }
    }

    /** Returns the trimmed MIME type of the URL, or {@code null} when it cannot be determined. */
    private String probeMimeType(String absoluteUrl) {
        try {
            String mimeType = Toolkit.getMimeType(absoluteUrl, this.sleepMillis);
            return mimeType == null ? null : mimeType.trim();
        } catch (Exception e) {
            this.logger.debug("Could not determine mime type of " + absoluteUrl, e);
            return null;
        }
    }

    /**
     * Loads robots.txt from the given location and selects the directives for the agent name.
     * A {@code null} location or a missing file switches robots.txt handling off.
     */
    private void loadRobotsTxt(String location) throws IOException {
        if (location == null) {
            this.ignoreRobotsTxt = true;
            return;
        }
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(new URL(location).openStream()));
            this.robot = new Robotstxt(reader);
            this.directives = this.robot.getDirectivesFor(this.agentName);
        } catch (FileNotFoundException e) {
            this.logger.debug("Robots.txt was not found at " + location);
            this.ignoreRobotsTxt = true;
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // The content has already been consumed; a failing close changes nothing.
                }
            }
        }
    }

    /** Restores the transient logger and robots.txt state after the instance was rebuilt. */
    private Object readResolve() throws IOException {
        // The logger must exist before loading, because the loader logs a missing file.
        this.logger = Logger.getLogger(XPathAndCrawl.class);
        loadRobotsTxt(this.robotstxtUrl);
        return this;
    }

    public boolean isResolveFrames() {
        return this.resolveFrames;
    }

    public void setResolveFrames(boolean z) {
        this.resolveFrames = z;
    }

    public boolean isSkipFirstPage() {
        return this.skipFirstPage;
    }

    public void setSkipFirstPage(boolean z) {
        this.skipFirstPage = z;
    }

    public long getSleepMillis() {
        return this.sleepMillis;
    }

    public void setSleepMillis(long j) {
        this.sleepMillis = j;
    }

    public List<String> getMimeTypes() {
        return this.mimeTypes;
    }

    public void setMimeTypes(List<String> list) {
        this.mimeTypes = list;
    }

    public List<String> getXpaths() {
        return this.xpaths;
    }

    public void setXpaths(List<String> list) {
        this.xpaths = list;
    }

    public void setIgnoreRobotsTxt(boolean z) {
        this.ignoreRobotsTxt = z;
    }

    public boolean isIgnoreRobotsTxt() {
        return this.ignoreRobotsTxt;
    }

    /**
     * Sets the agent name and, when a robots.txt has been loaded, re-selects its directives.
     *
     * @param str agent name to use from now on
     */
    public void setAgentName(String str) {
        this.agentName = str;
        // No robots.txt may have been loaded yet (e.g. after the default constructor).
        if (this.robot != null) {
            this.directives = this.robot.getDirectivesFor(str);
        }
    }

    public String getAgentName() {
        return this.agentName;
    }

    public void setFallback(boolean z) {
        this.fallback = z;
    }

    public boolean isFallback() {
        return this.fallback;
    }
}
