package hwu.elixir.scrape.scraper;

import com.fasterxml.jackson.databind.ObjectMapper;
import hwu.elixir.scrape.exceptions.CannotWriteException;
import hwu.elixir.scrape.exceptions.FourZeroFourException;
import hwu.elixir.scrape.exceptions.JsonLDInspectionException;
import hwu.elixir.scrape.exceptions.MissingHTMLException;
import hwu.elixir.scrape.exceptions.MissingMarkupException;
import hwu.elixir.scrape.exceptions.NTriplesParsingException;
import hwu.elixir.utils.Helpers;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Random;
import javax.annotation.Nullable;
import org.apache.any23.source.StringDocumentSource;
import org.eclipse.rdf4j.model.BNode;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Model;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.model.util.ModelBuilder;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.Rio;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:hwu/elixir/scrape/scraper/ScraperFilteredCore.class */
public class ScraperFilteredCore extends ScraperCore {
    /** Class-wide SLF4J logger; declared final because it is never reassigned. */
    private static final Logger logger = LoggerFactory.getLogger(ScraperFilteredCore.class.getName());

    /**
     * Number of JSON-LD blocks rewritten so far for the page currently being processed.
     * Reset to 0 by {@code injectId}, incremented by {@code fixAllJsonLdBlocks} each time a
     * block is swapped, and read by {@code fixASingleJSONLdObject} to build a distinct
     * {@code @id} suffix. Being mutable instance state, it presumably makes one instance
     * unsuitable for concurrent scrapes -- TODO confirm against callers.
     */
    private int countOfJSONLD = 0;

    /**
     * Scrapes one URL, converts the markup found there into a model and writes that model
     * to an N-Quads file.
     *
     * @param str  URL to scrape; it is normalised with {@code fixURL} before use
     * @param str2 path of the output folder; created (including parents) when missing
     * @param str3 output file name without the {@code .nq} extension; when {@code null}
     *             the value of {@code l} is used as the file name instead
     * @param l    counter handed to {@link #processTriples(String, IRI, Long)} and used as
     *             the fallback file name
     * @param bool {@code TRUE} selects {@code wrapHTMLExtraction} ("dynamic"); {@code FALSE}
     *             or {@code null} selects {@code wrapHTMLExtractionStatic} ("static")
     * @return {@code true} when the file was written; {@code false} when no HTML was
     *         obtained, the HTML was reported missing, no model was produced, or the
     *         generated N-Triples could not be parsed
     * @throws MissingMarkupException    when no triples could be extracted from the page
     * @throws CannotWriteException      when the output file could not be written
     * @throws FourZeroFourException     propagated from the HTML extraction helpers
     * @throws JsonLDInspectionException propagated from {@link #injectId(String, String)}
     */
    public boolean scrape(String str, String str2, String str3, Long l, @Nullable Boolean bool) throws FourZeroFourException, JsonLDInspectionException, CannotWriteException, MissingMarkupException {
        String url = fixURL(str);
        String html;
        // Boolean.TRUE.equals(...) tolerates the null that @Nullable explicitly allows;
        // unboxing it directly would throw a NullPointerException.
        if (Boolean.TRUE.equals(bool)) {
            logger.info("dynamic scraping setting");
            html = wrapHTMLExtraction(url);
        } else {
            logger.info("static scraping setting");
            html = wrapHTMLExtractionStatic(url);
        }
        if (html == null || html.isEmpty()) {
            return false;
        }
        if (logger.isTraceEnabled()) {
            logger.trace("Read following html ==============================================================");
            logger.trace(html);
        }
        try {
            String htmlWithIds = injectId(html, url);
            if (logger.isTraceEnabled()) {
                logger.trace("Same HTML after injecting ID ==============================================================");
                logger.trace(htmlWithIds);
            }
            StringDocumentSource documentSource = new StringDocumentSource(htmlWithIds, url);
            IRI sourceIri = SimpleValueFactory.getInstance().createIRI(documentSource.getDocumentIRI());
            String nTriples = getTriplesInNTriples(documentSource);
            if (nTriples == null) {
                throw new MissingMarkupException(url);
            }

            Model model;
            try {
                model = processTriples(nTriples, sourceIri, l);
            } catch (NTriplesParsingException e) {
                logger.error("Failed to process triples into model; the NTriples generated from the URL (" + url + ") could not be parsed into a model.", e);
                return false;
            }
            if (model == null) {
                return false;
            }

            String outputPath = str2 + "/" + (str3 == null ? String.valueOf(l) : str3) + ".nq";
            writeModelAsNQuads(model, str2, outputPath, url);
            return true;
        } catch (MissingHTMLException e) {
            logger.error(e.toString());
            return false;
        }
    }

    /**
     * Writes {@code model} as N-Quads to {@code outputPath}, creating {@code folder} first
     * when it does not exist. Every failure is reported as a {@link CannotWriteException}.
     */
    private void writeModelAsNQuads(Model model, String folder, String outputPath, String url) throws CannotWriteException {
        File outputFolder = new File(folder);
        if (!outputFolder.exists()) {
            // mkdirs also creates missing parents; a failure surfaces when the file is opened.
            outputFolder.mkdirs();
        }
        File outputFile = new File(outputPath);
        // N-Quads is a UTF-8 format, so the charset is fixed instead of platform dependent.
        try (PrintWriter out = new PrintWriter(outputFile, "UTF-8")) {
            Rio.write(model, out, RDFFormat.NQUADS);
            // PrintWriter never throws on write errors; it only records them.
            if (out.checkError()) {
                throw new IOException("I/O error while writing " + outputPath);
            }
        } catch (Exception e) {
            logger.error("Problem writing file for " + url, e);
            throw new CannotWriteException(url);
        }
        if (!outputFile.exists()) {
            // Previously this case called System.exit(0), killing the whole JVM with a
            // success status; callers now get the regular write-failure exception instead.
            logger.error("Output file " + outputPath + " is missing after writing for " + url);
            throw new CannotWriteException(url);
        }
    }

    /**
     * Builds the output model for one scraped page.
     *
     * <p>The default graph receives three provenance statements about the named graph
     * (source IRI, retrieval time, scraper release). Every statement parsed from
     * {@code str} is copied into the named graph, except those whose predicate contains
     * {@code vocab.sindice}, {@code xhtml/vocab}, {@code nofollow} or {@code ogp.me}.
     * Blank nodes are replaced by generated IRIs; the same blank node always maps to the
     * same IRI within one call.
     *
     * @param str N-Triples text to parse with {@code createModelFromNTriples}
     * @param iri IRI of the scraped document; supplies the site label and page name used
     *            in the graph name
     * @param l   counter appended to the graph name; must not be {@code null}
     * @return the assembled model
     * @throws NTriplesParsingException when {@code str} cannot be parsed
     */
    protected Model processTriples(String str, IRI iri, Long l) throws NTriplesParsingException {
        SimpleValueFactory valueFactory = SimpleValueFactory.getInstance();

        // Page name is the IRI's local name with everything from the first '.' removed.
        String pageName = iri.getLocalName();
        int extensionStart = pageName.indexOf('.');
        if (extensionStart != -1) {
            pageName = pageName.substring(0, extensionStart);
        }

        String crawlDate = new SimpleDateFormat("yyyyMMdd").format(new Date());
        String graphBase = "https://bioschemas.org/crawl/v1/" + crawlSiteLabel(iri.toString()) + "/" + pageName + "/" + crawlDate + "/";
        // longValue() keeps the NullPointerException a null counter has always produced,
        // which the removed no-op "Long.valueOf(l + 1)" statement used to trigger.
        String graphName = graphBase + l.longValue();

        ModelBuilder modelBuilder = new ModelBuilder();
        // NOTE(review): the " " prefix is kept as found; its purpose is not evident here.
        modelBuilder.setNamespace(" ", graphBase);
        modelBuilder.setNamespace("bsc", graphBase);
        IRI scraperRelease = valueFactory.createIRI("https://github.com/HW-SWeL/BMUSE/releases/tag/" + this.properties.getScraperVersion());
        modelBuilder.defaultGraph().add(graphName, "http://purl.org/pav/retrievedFrom", iri);
        modelBuilder.defaultGraph().add(graphName, "http://purl.org/pav/retrievedOn", Helpers.getFullDateWithTime());
        modelBuilder.defaultGraph().add(graphName, "http://purl.org/pav/createdWith", scraperRelease);

        // Blank node id -> IRI generated for it, so repeated blank nodes stay linked.
        HashMap<String, IRI> blankNodeIris = new HashMap<String, IRI>();
        for (Statement statement : createModelFromNTriples(str)) {
            String predicateText = statement.getPredicate().stringValue();
            if (predicateText.contains("vocab.sindice")
                    || predicateText.contains("xhtml/vocab")
                    || predicateText.contains("nofollow")
                    || predicateText.contains("ogp.me")) {
                continue;
            }
            IRI predicate = fixPredicate(statement.getPredicate());
            Value object = fixObject(statement.getObject());
            Resource subject = statement.getSubject();
            if (subject instanceof BNode) {
                subject = iriForBlankNode(subject, blankNodeIris, graphName, iri);
            }
            if (object instanceof BNode) {
                object = iriForBlankNode(object, blankNodeIris, graphName, iri);
            }
            modelBuilder.namedGraph(graphName).add(subject, predicate, object);
        }
        return modelBuilder.build();
    }

    /** Returns the IRI already assigned to {@code blankNode}, generating and recording one on first use. */
    private IRI iriForBlankNode(Value blankNode, HashMap<String, IRI> assigned, String graphName, IRI source) {
        String key = blankNode.stringValue();
        IRI replacement = assigned.get(key);
        if (replacement == null) {
            replacement = iriGenerator(graphName, source);
            assigned.put(key, replacement);
        }
        return replacement;
    }

    /**
     * Returns the first host label of {@code iriText}, skipping a leading {@code www.}
     * (for example {@code https://www.example.org/x} gives {@code example}). A host
     * without any dot, such as {@code localhost}, is returned whole instead of failing.
     */
    private static String crawlSiteLabel(String iriText) {
        int schemeSeparator = iriText.indexOf("//");
        int hostStart = schemeSeparator == -1 ? 0 : schemeSeparator + 2;
        String label = leadingHostLabel(iriText, hostStart);
        if (label.equalsIgnoreCase("www") && iriText.startsWith(".", hostStart + 3)) {
            String next = leadingHostLabel(iriText, hostStart + 4);
            if (!next.isEmpty()) {
                label = next;
            }
        }
        return label;
    }

    /** Returns the text from {@code start} up to the next '.', '/', '?' or '#', or to the end. */
    private static String leadingHostLabel(String iriText, int start) {
        int end = start;
        while (end < iriText.length() && "./?#".indexOf(iriText.charAt(end)) == -1) {
            end++;
        }
        return iriText.substring(start, end);
    }

    /**
     * Generates a pseudo-random IRI of the form
     * {@code <str>/<source IRI without http(s) scheme>[/]<non-negative random int>}.
     *
     * @param str prefix of the generated IRI (callers in this class pass the graph name)
     * @param iri IRI of the scraped document
     * @return the generated IRI; uniqueness is only probabilistic
     */
    protected IRI iriGenerator(String str, IRI iri) {
        String source = iri.toString();
        // Strip only a leading scheme; a regex replaceAll would also delete any
        // "http(s)://" embedded later in the IRI, e.g. inside a query parameter.
        String withoutScheme;
        if (source.startsWith("https://")) {
            withoutScheme = source.substring("https://".length());
        } else if (source.startsWith("http://")) {
            withoutScheme = source.substring("http://".length());
        } else {
            withoutScheme = source;
        }
        if (!withoutScheme.endsWith("/") && !withoutScheme.endsWith("#")) {
            withoutScheme = withoutScheme + "/";
        }
        // Masking the sign bit is always non-negative, unlike Math.abs, which returns
        // Integer.MIN_VALUE unchanged.
        int suffix = new Random().nextInt() & Integer.MAX_VALUE;
        return SimpleValueFactory.getInstance().createIRI(str + "/" + withoutScheme + suffix);
    }

    /**
     * Prepares scraped HTML for triple extraction by rewriting its JSON-LD blocks.
     *
     * <p>The per-page block counter is reset first. HTML that has no {@code @context} but
     * does declare a schema.org {@code vocab} attribute is returned unchanged; everything
     * else is passed to {@link #fixAllJsonLdBlocks(String, String)}.
     *
     * @param str  HTML (or raw JSON) obtained from the page
     * @param str2 URL the content came from
     * @return the content, possibly with rewritten JSON-LD blocks
     * @throws IllegalArgumentException  when {@code str2} is {@code null}
     * @throws MissingHTMLException      when {@code str} is {@code null}
     * @throws JsonLDInspectionException when a JSON-LD block cannot be processed
     */
    protected String injectId(String str, String str2) throws MissingHTMLException, JsonLDInspectionException {
        this.countOfJSONLD = 0;
        if (str2 == null) {
            throw new IllegalArgumentException("url cannot be null");
        }
        if (str == null) {
            throw new MissingHTMLException(str2);
        }

        boolean mentionsContext = str.contains("@context");
        boolean declaresSchemaVocab = str.contains("vocab=\"http://schema.org")
                || str.contains("vocab=\"https://schema.org");
        if (!mentionsContext && declaresSchemaVocab) {
            logger.info("No @context, but a vocab; appears to be RDFa with no JSON-LD: " + str2);
            return str;
        }
        return fixAllJsonLdBlocks(str, str2);
    }

    /**
     * Rewrites every JSON-LD block found in {@code str} and splices each changed block
     * back into the returned text.
     *
     * <p>Content starting with <code>{</code> is treated as a single bare JSON document;
     * otherwise the blocks are obtained from {@code getOnlyUnfilteredJSONLDFromHtml}. The
     * block counter is incremented once per block that was actually replaced.
     *
     * @param str  HTML or bare JSON to process
     * @param str2 URL the content came from
     * @return the content with rewritten JSON-LD blocks
     * @throws JsonLDInspectionException when a block cannot be parsed or inspected
     */
    protected String fixAllJsonLdBlocks(String str, String str2) throws JsonLDInspectionException {
        String[] jsonLdBlocks;
        if (str.startsWith("{")) {
            logger.info("Just JSON no HTML from: " + str2);
            jsonLdBlocks = new String[]{str};
        } else {
            jsonLdBlocks = getOnlyUnfilteredJSONLDFromHtml(str);
        }
        logger.debug("Number of JSONLD sections: " + jsonLdBlocks.length);
        for (String originalBlock : jsonLdBlocks) {
            String fixedBlock = fixASingleJsonLdBlock(originalBlock, str2);
            // Exact comparison: a case-insensitive one would silently discard a rewrite
            // that differs from the original only in letter case.
            if (!fixedBlock.equals(originalBlock)) {
                str = swapJsonLdMarkup(str, originalBlock, fixedBlock);
                this.countOfJSONLD++;
            }
        }
        return str;
    }

    /**
     * Parses one JSON-LD block and dispatches it to the object or array fixer.
     *
     * @param str  text of a single JSON-LD block
     * @param str2 URL the block came from; used in error messages and passed to the fixers
     * @return the rewritten block as a JSON string
     * @throws JsonLDInspectionException when the text is not parseable JSON, or parses to
     *                                   something that is neither an object nor an array
     */
    protected String fixASingleJsonLdBlock(String str, String str2) throws JsonLDInspectionException {
        Object parsed;
        try {
            parsed = new JSONParser().parse(str);
        } catch (ParseException parseFailure) {
            throw new JsonLDInspectionException("JSON ParseException. Failed to parse JSON from :" + str2 + " Parser Error: " + parseFailure);
        }

        if (parsed instanceof JSONObject) {
            return fixASingleJsonLdObject((JSONObject) parsed, str2);
        }
        if (parsed instanceof JSONArray) {
            return fixAJsonLdArray((JSONArray) parsed, str2);
        }
        throw new JsonLDInspectionException("Unkown object obtained from JSON parser :" + str2);
    }

    /**
     * Applies {@link #fixASingleJSONLdObject(JSONObject, String)} to every JSON object in
     * the array and serialises the array.
     *
     * <p>Elements that are not JSON objects (strings, numbers, nested arrays,
     * {@code null}) are left as they are instead of failing with a
     * {@code ClassCastException}.
     *
     * @param jSONArray parsed JSON-LD array; modified in place
     * @param str       URL the array came from
     * @return the array as a JSON string with {@code \/} written as {@code /}
     */
    @SuppressWarnings("unchecked") // JSONArray is a raw java.util.List
    protected String fixAJsonLdArray(JSONArray jSONArray, String str) {
        for (int i = 0; i < jSONArray.size(); i++) {
            Object element = jSONArray.get(i);
            if (element instanceof JSONObject) {
                jSONArray.set(i, fixASingleJSONLdObject((JSONObject) element, str));
            }
        }
        // json-simple serialises '/' as "\/". Only that escape is undone here: deleting
        // every backslash, as before, also destroyed escapes such as \" and \n and could
        // leave invalid JSON behind.
        return jSONArray.toJSONString().replace("\\/", "/");
    }

    /**
     * Fixes a single JSON-LD object and serialises it.
     *
     * @param jSONObject parsed JSON-LD object; modified in place
     * @param str        URL the object came from
     * @return the object as a JSON string with {@code \/} written as {@code /}
     */
    protected String fixASingleJsonLdObject(JSONObject jSONObject, String str) {
        // json-simple serialises '/' as "\/"; undo just that escape. The previous
        // replace("\\\\", "") instead removed escaped backslashes, silently dropping
        // backslash characters from string values.
        return fixASingleJSONLdObject(jSONObject, str).toJSONString().replace("\\/", "/");
    }

    /**
     * Normalises one JSON-LD object in place.
     *
     * <p>The {@code @context} is always set to the configured schema context, whatever
     * was there before (including a missing or {@code null} context). When the object has
     * no {@code @id}, one is added: the URL itself for the first block of a page, or the
     * URL followed by {@code -<count>} once earlier blocks have been rewritten.
     *
     * @param jSONObject object to normalise; modified in place
     * @param str        URL the object came from
     * @return the same {@code jSONObject} instance
     */
    @SuppressWarnings("unchecked") // JSONObject is a raw java.util.Map
    protected JSONObject fixASingleJSONLdObject(JSONObject jSONObject, String str) {
        if (!isJSONValid(jSONObject.toString())) {
            logger.error("invalid JSON-LD syntax");
        }

        // Both former branches ("is schema.org" / "is not schema.org") replaced the context
        // with the same value, so a single put is equivalent. It also avoids calling
        // toString() on a null "@context" value.
        jSONObject.put("@context", this.properties.getSchemaContext());

        if (!jSONObject.containsKey("@id")) {
            String generatedId = this.countOfJSONLD > 0 ? str + "-" + this.countOfJSONLD : str;
            jSONObject.put("@id", generatedId);
        }
        return jSONObject;
    }

    /**
     * Replaces the first occurrence of {@code str2} in {@code str} with {@code str3}.
     *
     * @param str  text to search
     * @param str2 literal text to replace (no regex interpretation)
     * @param str3 replacement text
     * @return the text with the first occurrence replaced, or {@code str} unchanged when
     *         {@code str2} does not occur in it
     */
    protected String swapJsonLdMarkup(String str, String str2, String str3) {
        int position = str.indexOf(str2);
        if (position == -1) {
            // Nothing to swap; substring(0, -1) would throw StringIndexOutOfBoundsException.
            return str;
        }
        return str.substring(0, position) + str3 + str.substring(position + str2.length());
    }

    /** Shared parser for {@link #isJSONValid(String)}; created once instead of on every call. */
    private static final ObjectMapper JSON_VALIDATION_MAPPER = new ObjectMapper();

    /**
     * Checks whether {@code str} can be parsed as JSON.
     *
     * @param str text to check
     * @return {@code true} when the text parses; {@code false} (after logging the
     *         failure) when parsing throws an {@link IOException}
     */
    private static boolean isJSONValid(String str) {
        try {
            JSON_VALIDATION_MAPPER.readTree(str);
            return true;
        } catch (IOException e) {
            // Pass the exception along so the log shows where the JSON is broken.
            logger.error("INVALID JSON-LD SYNTAX", e);
            return false;
        }
    }
}
