package org.gcube.textextractor.extractors;

import com.hp.hpl.jena.sparql.ARQConstants;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import javax.xml.namespace.NamespaceContext;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.apache.tika.language.LanguageIdentifier;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.xml.XMLParser;
import org.apache.tika.sax.BodyContentHandler;
import org.codehaus.jackson.util.MinimalPrettyPrinter;
import org.gcube.semantic.annotator.AnnotationBase;
import org.gcube.semantic.annotator.utils.ANNOTATIONS;
import org.gcube.textextractor.entities.ExtractedEntity;
import org.gcube.textextractor.entities.ShortenCE4NameResponse;
import org.gcube.textextractor.helpers.ExtractorHelper;
import org.gcube.textextractor.helpers.XPathHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.xml.sax.SAXException;

/* loaded from: input_file:WEB-INF/lib/smartfish-doc-processor-2.1.1-3.7.0.jar:org/gcube/textextractor/extractors/XMLExtractor.class */
public class XMLExtractor extends InformationExtractor {
    private static final Logger logger = LoggerFactory.getLogger(XMLExtractor.class);

    /* loaded from: input_file:WEB-INF/lib/smartfish-doc-processor-2.1.1-3.7.0.jar:org/gcube/textextractor/extractors/XMLExtractor$FigisNamespaceContext.class */
    public static class FigisNamespaceContext implements NamespaceContext {
        private final Map<String, String> prefixMap;

        FigisNamespaceContext(Map<String, String> map) {
            if (map != null) {
                this.prefixMap = Collections.unmodifiableMap(new HashMap(map));
            } else {
                this.prefixMap = Collections.emptyMap();
            }
        }

        @Override // javax.xml.namespace.NamespaceContext
        public String getPrefix(String str) {
            return null;
        }

        @Override // javax.xml.namespace.NamespaceContext
        public Iterator<?> getPrefixes(String str) {
            return null;
        }

        @Override // javax.xml.namespace.NamespaceContext
        public String getNamespaceURI(String str) {
            if (str == null) {
                throw new NullPointerException("Invalid Namespace Prefix");
            }
            return "fi".equalsIgnoreCase(str) ? "http://www.fao.org/fi/figis/devcon/" : "argls".equals(str) ? "http://www.naa.gov.au/recordkeeping/gov_online/agls/1.1" : "ags".equals(str) ? "http://www.purl.org/agmes/1.1/" : "aida".equals(str) ? "http://www.idmlinitiative.org/resources/dtds/AIDA22.xsd" : DublinCore.PREFIX_DC.equals(str) ? "http://purl.org/dc/elements/1.1/" : DublinCore.PREFIX_DC_TERMS.equals(str) ? "http://purl.org/dc/terms/" : "xsi".equals(str) ? "http://www.w3.org/2001/XMLSchema-instance" : "xml".equals(str) ? "http://www.w3.org/XML/1998/namespace" : "";
        }
    }

    /**
     * Extracts the fields of a single XML file.
     *
     * <p>The file content is run through Tika's {@link XMLParser}; the collected text (with
     * empty lines removed) is stored under {@code text} and the language reported by
     * {@link LanguageIdentifier} for that text under {@code language}. {@code documentID} is
     * set to the given path and {@code provenance} to {@code "FIRMS"}. The XPath-based fields
     * from {@code customFields} are added last and therefore win on a key clash.
     *
     * @param str path of the XML file to process
     * @return map of field name to extracted value
     * @throws Exception any failure while opening or parsing the file; it is logged and
     *         rethrown unchanged
     */
    @Override
    public Map<String, String> extractFieldsFromFile(String str) throws Exception {
        logger.info("Processing file : " + str);
        final long startedAt = System.currentTimeMillis();
        // try-with-resources closes the stream on every path and records a failing close()
        // as a suppressed exception instead of letting it replace the original error.
        try (FileInputStream input = new FileInputStream(str)) {
            BodyContentHandler textHandler = new BodyContentHandler();
            Metadata metadata = new Metadata();
            metadata.set("Content-Type", "application/xml; charset=utf-8");
            new XMLParser().parse(input, textHandler, metadata, new ParseContext());
            String text = ExtractorHelper.removeEmptyLines(textHandler.toString());

            Map<String, String> fields = new HashMap<>();
            fields.put("documentID", str);
            fields.put("text", text);
            fields.put("language", new LanguageIdentifier(text).getLanguage());
            fields.put("provenance", "FIRMS");

            long customStartedAt = System.currentTimeMillis();
            fields.putAll(customFields(str));
            logger.info("~> field extraction time  : " + ((System.currentTimeMillis() - customStartedAt) / 1000.0d) + " secs");
            return fields;
        } catch (Exception e) {
            // Route the failure through the logger rather than printing to stderr.
            logger.error("error while processing file : " + str, e);
            throw e;
        } finally {
            // Total time is reported for successful and failed runs alike.
            logger.info("time processing file : " + str + " : " + ((System.currentTimeMillis() - startedAt) / 1000.0d) + " secs");
        }
    }

    /**
     * Extracts and enriches one record per file name returned by
     * {@code ExtractorHelper.getFilenames(str)}.
     *
     * <p>A file whose extraction or enrichment throws is logged and skipped; the remaining
     * files are still processed.
     *
     * @param str location handed to {@code ExtractorHelper.getFilenames}
     *            (presumably a directory of XML files - confirm against the helper)
     * @return the enriched records of all files that were processed without an exception
     * @throws FileNotFoundException declared by the original signature
     *         (presumably raised by the file name lookup - confirm)
     */
    @Override
    public List<Map<String, String>> extractInfo(String str) throws FileNotFoundException {
        final List<Map<String, String>> records = new ArrayList<>();
        int fileNumber = 0;
        for (String filename : ExtractorHelper.getFilenames(str)) {
            fileNumber += 1;
            logger.info("Processing file : " + fileNumber + MinimalPrettyPrinter.DEFAULT_ROOT_VALUE_SEPARATOR + filename);
            try {
                final Map<String, String> extracted = extractFieldsFromFile(filename);

                // Only the enrichment step is covered by this timer.
                final long enrichmentStart = System.currentTimeMillis();
                final Map<String, String> enriched = enrichRecord(extracted, filename);
                final double enrichmentSeconds = (System.currentTimeMillis() - enrichmentStart) / 1000.0d;
                logger.info("~> field enrichment time  : " + enrichmentSeconds + " secs");

                records.add(enriched);
            } catch (Exception failure) {
                logger.error("error while extracting info from : " + filename + " . will skip this file", failure);
            }
        }
        return records;
    }

    /**
     * Builds an enriched copy of an extracted record and annotates the document.
     *
     * <p>The returned map contains every entry of {@code map}, with {@code documentID}
     * replaced by a URI under {@code http://smartfish.collection/firms/} and with the
     * responses of the country, year, status, species and management lookups added. The
     * URIs taken from those responses are collected separately and passed to
     * {@code annotate} together with the document URI.
     *
     * <p>The document URI is derived from {@code str}: it is split on
     * {@code ARQConstants.allocSSEUnamedVars} and the second-to-last token is used; when
     * there are fewer than two tokens the whole string is used. The chosen part is
     * lower-cased with {@link Locale#ROOT} so the URI does not depend on the JVM's default
     * locale.
     *
     * <p>Failures of the species and management lookups are logged as warnings and do not
     * abort the enrichment.
     *
     * @param map record produced by the extraction step
     * @param str name of the file the record was extracted from
     * @return the enriched copy of {@code map}
     */
    @Override
    public Map<String, String> enrichRecord(Map<String, String> map, String str) {
        // NOTE(review): the delimiter comes from a Jena ARQ constant, which looks like a
        // decompiler substitution for a plain literal - confirm the intended separator.
        // String.split never returns null, so only the token count needs checking.
        String[] nameParts = str.split(ARQConstants.allocSSEUnamedVars);
        String idPart = nameParts.length > 1 ? nameParts[nameParts.length - 2] : str;
        String documentUri = "http://smartfish.collection/firms/" + idPart.toLowerCase(Locale.ROOT);

        Map<String, String> enriched = new HashMap<>(map);
        enriched.put("documentID", documentUri);
        Map<String, List<String>> annotationUris = new HashMap<>();

        ExtractorHelper.enrichListField(map, enriched, annotationUris, ANNOTATIONS.getLocalName(ANNOTATIONS.COUNTRY), new ExtractorHelper.QueryWrapperList() {
            @Override
            public String doCall(List<ExtractedEntity> list) throws Exception {
                return ExtractorHelper.queryCountry(list);
            }
        });
        ExtractorHelper.enrichSimpleField(map, enriched, annotationUris, ANNOTATIONS.getLocalName(ANNOTATIONS.YEAR), new ExtractorHelper.QueryWrapperSimple() {
            @Override
            public String doCall(ExtractedEntity extractedEntity) throws Exception {
                return ExtractorHelper.queryYear(extractedEntity);
            }
        });
        ExtractorHelper.enrichListField(map, enriched, annotationUris, ANNOTATIONS.getLocalName(ANNOTATIONS.STATUS), new ExtractorHelper.QueryWrapperList() {
            @Override
            public String doCall(List<ExtractedEntity> list) throws Exception {
                return ExtractorHelper.queryExploitationStatus(list);
            }
        });

        addSpeciesUris(map, enriched, annotationUris);

        ExtractorHelper.enrichListField(map, enriched, annotationUris, ANNOTATIONS.getLocalName(ANNOTATIONS.MANAGEMENT), new ExtractorHelper.QueryWrapperList() {
            @Override
            public String doCall(List<ExtractedEntity> list) throws Exception {
                return ExtractorHelper.queryManagement(list);
            }
        });

        // NOTE(review): this runs in addition to the list enrichment just above; if the
        // MANAGEMENT local name is also "management", the "management_uris" entries written
        // there are overwritten here - confirm that this is intended.
        addManagementUnitUris(map, enriched, annotationUris);

        try {
            annotate(documentUri, annotationUris);
        } catch (FileNotFoundException e) {
            logger.error("file : " + str + " not found", e);
        }
        return enriched;
    }

    /**
     * Looks up the comma-separated English species names of the record and stores the raw
     * response in {@code enriched} and the URIs taken from it in {@code annotationUris},
     * both under {@code species_uris}. Does nothing when the field is absent or blank.
     */
    private static void addSpeciesUris(Map<String, String> record, Map<String, String> enriched,
            Map<String, List<String>> annotationUris) {
        String speciesNames = record.get("species_english_name");
        try {
            if (speciesNames != null && speciesNames.trim().length() > 0) {
                List<ExtractedEntity> species = new ArrayList<>();
                for (String name : speciesNames.split("\\s*,\\s*")) {
                    species.add(new ExtractedEntity(name, ""));
                }
                String response = ExtractorHelper.querySpecies(species);
                annotationUris.put("species_uris", ShortenCE4NameResponse.getURIFromJSON(response));
                enriched.put("species_uris", response);
            }
        } catch (Exception e) {
            logger.warn("Error processing species : " + speciesNames, e);
        }
    }

    /**
     * Looks up the {@code management} value of the record (a value of "true", in any case,
     * is queried as "Management Unit") and stores the raw response in {@code enriched} and
     * the URIs taken from it in {@code annotationUris}, both under {@code management_uris}.
     * Does nothing when the field is absent.
     */
    private static void addManagementUnitUris(Map<String, String> record, Map<String, String> enriched,
            Map<String, List<String>> annotationUris) {
        String management = record.get("management");
        try {
            if (management != null) {
                String queryTerm = management.equalsIgnoreCase("true") ? "Management Unit" : management;
                String response = ExtractorHelper.queryManagement(new ExtractedEntity(queryTerm, ""));
                annotationUris.put("management_uris", ShortenCE4NameResponse.getURIFromJSON(response));
                enriched.put("management_uris", response);
            }
        } catch (Exception e) {
            logger.warn("Error processing management : " + management, e);
        }
    }

    /**
     * Reads the XPath-addressed fields of a FIGIS document.
     *
     * <p>The file is parsed into a namespace-aware DOM and queried with XPath expressions
     * whose prefixes are resolved by {@link FigisNamespaceContext}. Title, country, species
     * names and status are always written to the result; the management flag and the
     * reporting year are written only when the corresponding node exists.
     *
     * @param path path of the XML file to read
     * @return map of field name to the value returned by the XPath helpers
     * @throws SAXException if the file is not well-formed XML
     * @throws IOException if the file cannot be read
     * @throws ParserConfigurationException if no DOM parser can be created
     * @throws XPathExpressionException if an XPath expression cannot be evaluated
     */
    private Map<String, String> customFields(String path) throws SAXException, IOException, ParserConfigurationException, XPathExpressionException {
        // NOTE(review): apart from namespace awareness the parser runs with default
        // settings; if input files can come from untrusted sources, consider disabling
        // DTD / external-entity processing on the factory.
        final DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
        builderFactory.setNamespaceAware(true);
        final Document document = builderFactory.newDocumentBuilder().parse(new File(path));
        document.getDocumentElement().normalize();

        final XPath xpath = XPathFactory.newInstance().newXPath();
        xpath.setNamespaceContext(new FigisNamespaceContext(null));

        final Map<String, String> fields = new HashMap<>();

        final String title = XPathHelper.getValueXPath(xpath, document, "/fi:FIGISDoc/fi:AqRes/fi:AqResIdent/dc:Title/text()");
        fields.put("title", title);

        final String countries = XPathHelper.getMultiValuesXPath(xpath, document, "/fi:FIGISDoc/fi:AqRes/fi:WaterAreaOverview/fi:WaterAreaRef/dc:Title[@xml:lang='en']/text()");
        fields.put(ANNOTATIONS.getLocalName(ANNOTATIONS.COUNTRY), countries);

        final String englishNames = XPathHelper.getMultiValuesXPath(xpath, document, "/fi:FIGISDoc/fi:AqRes/fi:AqResIdent/fi:SpeciesList/fi:SpeciesRef/dc:Title[@xml:lang=\"en\"]/text()");
        fields.put("species_english_name", englishNames);

        final String scientificNames = XPathHelper.getMultiValuesXPath(xpath, document, "/fi:FIGISDoc/fi:AqRes/fi:AqResIdent/fi:SpeciesList/fi:SpeciesRef/fi:ForeignID[@CodeSystem='scientific_name']", "Code");
        fields.put("species_scientific_name", scientificNames);

        // The management flag is only recorded when the document marks a management unit.
        final boolean isManagementUnit = XPathHelper.checkNodeExists(xpath, document, "/fi:FIGISDoc/fi:AqRes/fi:Management[@ManagementUnit='true']").booleanValue();
        if (isManagementUnit) {
            fields.put(ANNOTATIONS.getLocalName(ANNOTATIONS.MANAGEMENT), "true");
        }

        // The year is only recorded for fact sheets that carry a reporting year.
        final boolean hasReportingYear = XPathHelper.checkNodeExists(xpath, document, "/fi:FIGISDoc/fi:AqRes/fi:AqResIdent[@Factsheet='true']/fi:ReportingYear/text()").booleanValue();
        if (hasReportingYear) {
            final String year = XPathHelper.getStringAttribute(xpath, document, "/fi:FIGISDoc/fi:AqRes/fi:AqResIdent[@Factsheet='true']/fi:ReportingYear/text()");
            fields.put(ANNOTATIONS.getLocalName(ANNOTATIONS.YEAR), year);
        }

        final String status = XPathHelper.getStringAttribute(xpath, document, "/fi:FIGISDoc/fi:AqRes/fi:AqResStateTrend/fi:ExploitState/@Value");
        fields.put(ANNOTATIONS.getLocalName(ANNOTATIONS.STATUS), status);

        logger.info("extracted fields : " + fields);
        return fields;
    }

    /**
     * Converts a record into its rowset representation.
     *
     * <p>Delegates to {@code ExtractorHelper.createRowseFromFields}, passing the record's
     * {@code documentID} and {@code language} values, the collection id and index type
     * declared on {@link InformationExtractor}, and the complete record.
     *
     * @param map record to convert
     * @return the value returned by the helper
     */
    @Override
    public String convertInfoToRowset(Map<String, String> map) {
        final String documentId = map.get("documentID");
        final String language = map.get("language");
        return ExtractorHelper.createRowseFromFields(
                documentId,
                InformationExtractor.collectionID,
                InformationExtractor.idxType,
                language,
                map);
    }

    /**
     * Registers the collected URIs as annotations of a document.
     *
     * <p>Each entry is dispatched on its key, which is expected to be an annotation's local
     * name followed by {@code "_uris"}. Country, species, gear, management, year and status
     * entries are forwarded, URI by URI, to the matching {@code FIRMS_*} method of
     * {@link AnnotationBase}; entries with any other key are ignored.
     *
     * @param documentUri URI of the document being annotated
     * @param urisByField annotation URIs keyed by field name
     * @throws FileNotFoundException declared by the original signature
     *         (presumably raised by the annotation store - confirm)
     */
    private void annotate(String documentUri, Map<String, List<String>> urisByField) throws FileNotFoundException {
        final AnnotationBase annotations = AnnotationBase.getInstance();
        for (Map.Entry<String, List<String>> field : urisByField.entrySet()) {
            // The branches are tested in a fixed order and only the first match is applied.
            if (field.getKey().equals(ANNOTATIONS.getLocalName(ANNOTATIONS.COUNTRY) + "_uris")) {
                for (String uri : field.getValue()) {
                    annotations.FIRMS_country(documentUri, uri);
                }
            } else if (field.getKey().equals(ANNOTATIONS.getLocalName(ANNOTATIONS.SPECIES) + "_uris")) {
                for (String uri : field.getValue()) {
                    annotations.FIRMS_species(documentUri, uri);
                }
            } else if (field.getKey().equals(ANNOTATIONS.getLocalName(ANNOTATIONS.GEAR) + "_uris")) {
                for (String uri : field.getValue()) {
                    annotations.FIRMS_gear(documentUri, uri);
                }
            } else if (field.getKey().equals(ANNOTATIONS.getLocalName(ANNOTATIONS.MANAGEMENT) + "_uris")) {
                for (String uri : field.getValue()) {
                    annotations.FIRMS_management(documentUri, uri);
                }
            } else if (field.getKey().equals(ANNOTATIONS.getLocalName(ANNOTATIONS.YEAR) + "_uris")) {
                for (String uri : field.getValue()) {
                    annotations.FIRMS_year(documentUri, uri);
                }
            } else if (field.getKey().equals(ANNOTATIONS.getLocalName(ANNOTATIONS.STATUS) + "_uris")) {
                for (String uri : field.getValue()) {
                    annotations.FIRMS_status(documentUri, uri);
                }
            }
        }
    }
}
