package eu.dnetlib.data.utility.resource_discovery.harvester;

import eu.dnetlib.data.utility.resource_discovery.crawler.Crawler;
import eu.dnetlib.data.utility.resource_discovery.crawler.ResourceExtractor;
import eu.dnetlib.data.utility.resource_discovery.url_filter.UrlFilter;
import java.io.IOException;
import java.util.Date;
import java.util.Iterator;
import java.util.Vector;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dlese.dpc.oai.harvester.Harvester;
import org.dlese.dpc.oai.harvester.Hexception;
import org.dlese.dpc.oai.harvester.OAIErrorException;
import org.dlese.dpc.xml.XMLDoc;
import org.dlese.dpc.xml.XMLException;

/* loaded from: input_file:eu/dnetlib/data/utility/resource_discovery/harvester/ResourceHarvester.class */
/**
 * Static helpers that harvest {@code oai_dc} records from a repository through
 * {@link Harvester#harvest}, pick a URL out of each record's Dublin Core fields, and hand
 * that URL to a {@link Crawler} / {@link ResourceExtractor} pair.
 *
 * <p>Harvested records are handled as {@code String[]} pairs. Judging from how {@link #main}
 * consumes them, element {@code 0} is the record identifier (used only in messages) and
 * element {@code 1} is the record XML; this layout is defined by the harvester library and
 * should be confirmed against its documentation.
 */
public class ResourceHarvester {
    private static final Log logger = LogFactory.getLog(ResourceHarvester.class);

    /**
     * Harvests every {@code oai_dc} record of a repository, without any set or date filter.
     *
     * @param str base URL of the repository, passed as-is to the harvester
     * @return whatever {@link Harvester#harvest} returns for this request
     * @throws Hexception propagated from the harvester
     * @throws OAIErrorException propagated from the harvester
     */
    public static String[][] getRecordsFromRepository(String str) throws Hexception, OAIErrorException {
        return getRecordsFromRepository(str, null, null, null);
    }

    /**
     * Harvests {@code oai_dc} records of a repository, forwarding the optional filters to
     * the harvester. {@link #main} feeds the {@code -set}, {@code -from} and {@code -until}
     * command-line values into {@code str2}, {@code date} and {@code date2} respectively.
     *
     * @param str base URL of the repository, passed as-is to the harvester
     * @param str2 set name, or {@code null}
     * @param date "from" date, or {@code null}
     * @param date2 "until" date, or {@code null}
     * @return whatever {@link Harvester#harvest} returns for this request
     * @throws Hexception propagated from the harvester
     * @throws OAIErrorException propagated from the harvester
     */
    public static String[][] getRecordsFromRepository(String str, String str2, Date date, Date date2) throws Hexception, OAIErrorException {
        return Harvester.harvest(str, "oai_dc", str2, date, date2, (String) null, true);
    }

    /**
     * Resolves the URL of a record, crawls it, and extracts the resources from the crawled
     * links.
     *
     * @param str record identifier, used only in log messages
     * @param str2 record XML
     * @param crawler crawler used to collect the links reachable from the record URL
     * @param resourceExtractor extractor applied to the collected links
     * @return a two-element vector holding the crawled links followed by the extracted
     *     resources, or {@code null} when no URL could be chosen for the record
     * @throws IOException propagated from the URL filter, crawler or extractor
     * @throws InterruptedException propagated from the crawler or extractor
     */
    public static Vector<Vector<String>> getResourceAndLinks(String str, String str2, Crawler crawler, ResourceExtractor resourceExtractor) throws IOException, InterruptedException {
        String recordUrl = getDcIdentifier(str2, str);
        if (recordUrl == null) {
            return null;
        }
        Vector<String> links = crawler.getLinks(UrlFilter.resolveRedirections(recordUrl));
        // Only allocate the result once we know there is something to return.
        Vector<Vector<String>> result = new Vector<>();
        result.add(links);
        result.add(resourceExtractor.extractResource(links));
        return result;
    }

    /**
     * Chooses a URL for a record, looking at {@code dc:identifier} first and then at
     * {@code dc:source} and {@code dc:relation}.
     *
     * @param str record XML
     * @param str2 record identifier, used only in log messages
     * @return the chosen URL, or {@code null} when the XML cannot be loaded, the
     *     {@code dc:identifier} lookup fails, or none of the inspected values is a URL
     * @throws IOException propagated from the URL filter
     */
    public static String getIdentifier(String str, String str2) throws IOException {
        return selectUrl(str, str2, "dc:source", "dc:relation");
    }

    /**
     * Chooses a URL for a record, looking only at its {@code dc:identifier} values.
     *
     * @param str record XML
     * @param str2 record identifier, used only in log messages
     * @return the chosen URL, or {@code null} when the XML cannot be loaded, the
     *     {@code dc:identifier} lookup fails, or none of the values is a URL
     * @throws IOException propagated from the URL filter
     */
    public static String getDcIdentifier(String str, String str2) throws IOException {
        return selectUrl(str, str2);
    }

    /**
     * Shared implementation of {@link #getIdentifier} and {@link #getDcIdentifier}: gathers
     * the URL-valued entries of {@code dc:identifier} plus any extra fields, then picks one.
     * A single candidate is returned directly; among several, the first one accepted by
     * {@code UrlFilter.checkExtension} wins, falling back to the first candidate overall.
     */
    private static String selectUrl(String recordXml, String recordId, String... extraFields) throws IOException {
        XMLDoc doc = new XMLDoc();
        try {
            doc.useXmlString(recordXml, true, true, true);
        } catch (XMLException e) {
            logger.debug("WARNING: The record " + recordId + " seems to be malformed (deleted maybe?)");
            return null;
        }

        Vector<String> candidates = new Vector<>();
        try {
            // dc:identifier is requested with a minimum of 1 while the extra fields use 0;
            // presumably the lookup throws when the minimum is not met -- confirm in XMLDoc.
            addUrls(doc.getXmlFields(1, 0, "dc:identifier"), candidates);
            for (String field : extraFields) {
                addUrls(doc.getXmlFields(0, 0, field), candidates);
            }
        } catch (XMLException e) {
            logger.debug("WARNING: The record " + recordId + " does not seem to have a dc:identifier field");
            return null;
        }

        if (candidates.isEmpty()) {
            logger.debug("WARNING: The record " + recordId + " does not seem to have a field that is a url");
            return null;
        }
        if (candidates.size() == 1) {
            return candidates.firstElement();
        }
        for (String candidate : candidates) {
            if (UrlFilter.checkExtension(candidate)) {
                return candidate;
            }
        }
        logger.debug("WARNING: The record " + recordId + " has multiple fields with valid urls and there is no way to choose one. The first one will be used");
        return candidates.firstElement();
    }

    /** Appends to {@code target} every value that {@code UrlFilter.isUrl} accepts. */
    private static void addUrls(String[] values, Vector<String> target) throws IOException {
        for (String value : values) {
            if (UrlFilter.isUrl(value)) {
                target.add(value);
            }
        }
    }

    /**
     * Parses a {@code day:month:year} command-line value.
     *
     * @return the parsed date, or {@code null} when the value does not consist of exactly
     *     three colon-separated integers
     */
    @SuppressWarnings("deprecation") // the harvester API takes java.util.Date; keeps the original construction
    private static Date parseDate(String text) {
        String[] parts = text.split(":");
        if (parts.length != 3) {
            return null;
        }
        try {
            int day = Integer.parseInt(parts[0]);
            int month = Integer.parseInt(parts[1]);
            int year = Integer.parseInt(parts[2]);
            return new Date(year - 1900, month - 1, day);
        } catch (NumberFormatException e) {
            // Non-numeric components are a format error, not a reason to crash.
            return null;
        }
    }

    /**
     * Command-line entry point: harvests a repository and, for every record that yields a
     * URL, prints the crawled links and the extracted resources.
     *
     * <p>Usage: {@code program_name baseUrl [-from day:month:year] [-until day:month:year]
     * [-set setName]}. Every option must be followed by a value; anything else aborts with
     * an error message on standard error.
     *
     * @param strArr command-line arguments
     */
    public static void main(String[] strArr) {
        if (strArr.length == 0) {
            System.err.println("Usage:\t program_name baseUrl [-from day:month:year] [-until day:month:year] [-set setName]");
            return;
        }
        String baseUrl = strArr[0];
        String setName = null;
        Date from = null;
        Date until = null;

        // Options come in (flag, value) pairs after the base URL.
        for (int i = 1; i < strArr.length; i += 2) {
            String option = strArr[i];
            if (i + 1 >= strArr.length) {
                System.err.println("The arguments you specified are incorrect");
                return;
            }
            String value = strArr[i + 1];
            if (option.equals("-set")) {
                setName = value;
            } else if (option.equals("-from")) {
                from = parseDate(value);
                if (from == null) {
                    System.err.println("The date format for the -from field provided is incorrect");
                    return;
                }
            } else if (option.equals("-until")) {
                until = parseDate(value);
                if (until == null) {
                    System.err.println("The date format for the -until field provided is incorrect");
                    return;
                }
            } else {
                System.err.println("The arguments you specified are incorrect");
                return;
            }
        }

        String[][] records;
        try {
            records = getRecordsFromRepository(baseUrl, setName, from, until);
        } catch (Exception e) {
            System.err.println(e.getLocalizedMessage());
            e.printStackTrace();
            return;
        }
        if (records == null) {
            // Guard so that a missing result is not misreported as a crawler set-up failure.
            System.err.println("No records were returned for " + baseUrl);
            return;
        }

        Crawler crawler;
        ResourceExtractor resourceExtractor;
        try {
            crawler = new Crawler();
            resourceExtractor = new ResourceExtractor();
        } catch (Exception e) {
            System.err.println("FATAL ERROR: Crawler could not be configured. Please check your robot.xml parameters and try again.");
            System.err.println(e.getLocalizedMessage());
            e.printStackTrace();
            return;
        }

        for (String[] record : records) {
            // Best label known so far for the warning: record id first, then the URL.
            String label = "";
            try {
                label = record[0];
                String recordUrl = getDcIdentifier(record[1], record[0]);
                if (recordUrl != null) {
                    label = recordUrl;
                    String resolvedUrl = UrlFilter.resolveRedirections(recordUrl);
                    System.out.println("Now processing " + resolvedUrl);
                    Vector<String> links = crawler.getLinks(resolvedUrl);
                    System.out.println("Retrieved links are: " + links);
                    System.out.println("Resources seem to be available in: " + resourceExtractor.extractResource(links));
                    System.out.println();
                }
            } catch (Exception e) {
                System.err.println("WARNING: Record " + label + " couldn't be processed: " + e);
                if (e instanceof InterruptedException) {
                    // Restore the interrupt status and stop instead of silently carrying on.
                    Thread.currentThread().interrupt();
                    break;
                }
            }
        }
    }
}
