package eu.dnetlib.iis.metadataextraction;

import com.itextpdf.text.exceptions.InvalidPdfException;
import eu.dnetlib.iis.audit.schemas.Cause;
import eu.dnetlib.iis.audit.schemas.Fault;
import eu.dnetlib.iis.core.javamapreduce.MultipleOutputs;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.avro.mapred.AvroKey;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
import pl.edu.icm.cermine.ContentExtractor;
import pl.edu.icm.cermine.exception.AnalysisException;
import pl.edu.icm.cermine.exception.TransformationException;

/* loaded from: input_file:eu/dnetlib/iis/metadataextraction/AbstractMetadataExtractorMapper.class */
public abstract class AbstractMetadataExtractorMapper<T> extends Mapper<AvroKey<T>, NullWritable, NullWritable, NullWritable> {
    /**
     * Configuration key read in {@code setup}; its integer value is multiplied by 1000 and
     * stored in {@link #processingTimeThreshold}, which is compared against elapsed milliseconds.
     */
    public static final String LOG_FAULT_PROCESSING_TIME_THRESHOLD_SECS = "log.fault.processing.time.threshold.secs";
    /** Fault code written by {@code processStream} when processing took longer than the threshold. */
    public static final String FAULT_CODE_PROCESSING_TIME_THRESHOLD_EXCEEDED = "ProcessingTimeThresholdExceeded";
    /** Supplementary-data key under which the elapsed processing time (milliseconds) is stored in that fault. */
    public static final String FAULT_SUPPLEMENTARY_DATA_PROCESSING_TIME = "processing_time";
    /** Supplementary-data key; not referenced in this class - presumably populated by subclasses (verify). */
    public static final String FAULT_SUPPLEMENTARY_DATA_URL = "url";
    /** Configuration key; not referenced in this class - presumably read by subclasses (verify). */
    public static final String IMPORT_CONTENT_CONNECTION_TIMEOUT = "import.content.connection.timeout";
    /** Configuration key; not referenced in this class - presumably read by subclasses (verify). */
    public static final String IMPORT_CONTENT_READ_TIMEOUT = "import.content.read.timeout";
    /**
     * Configuration key read in {@code setup}; its integer value is multiplied by 1024 and
     * stored in {@link #maxFileSizeKB}.
     */
    public static final String IMPORT_CONTENT_MAX_FILE_SIZE_MB = "import.content.max.file.size.mb";
    /** Named output for extracted metadata, taken from the {@code output.meta} configuration property. */
    protected String namedOutputMeta;
    /** Named output for extracted plaintext, taken from the {@code output.plaintext} configuration property. */
    protected String namedOutputPlaintext;
    /** Named output for fault records, taken from the {@code output.fault} configuration property. */
    protected String namedOutputFault;
    /** Identifiers to skip, parsed from the comma-separated {@code excluded.ids} property; stays null when none were configured. */
    protected Set<String> excludedIds;
    /** Logger shared with subclasses. */
    protected final Logger log = Logger.getLogger(AbstractMetadataExtractorMapper.class);
    /** When true, an AnalysisException closes the outputs and is rethrown as RuntimeException instead of being recorded as a fault. */
    protected boolean analysisExceptionAsCritical = false;
    /** Same switch as above, applied to any other exception caught while writing metadata. */
    protected boolean otherExceptionAsCritical = false;
    /** Output writer; created at the end of {@code setup} and closed in {@code cleanup}. */
    protected MultipleOutputs mos = null;
    /** Number of processed elements between two progress log messages. */
    protected int progresLogInterval = 100;
    /** Count of {@code processStream} invocations since {@code setup}. */
    protected int currentProgress = 0;
    /** Wall-clock time (milliseconds) at which the current progress interval started. */
    private long intervalTime = 0;
    /** Content size limit; larger inputs get empty results. Defaults to no limit. */
    protected long maxFileSizeKB = Long.MAX_VALUE;
    /** Processing time (milliseconds) above which an additional fault is written. Defaults to no limit. */
    protected long processingTimeThreshold = Long.MAX_VALUE;

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Reads the job configuration and prepares the mapper for processing.
     *
     * <p>The properties {@code output.meta}, {@code output.plaintext} and {@code output.fault} are
     * mandatory. {@code excluded.ids}, {@link #IMPORT_CONTENT_MAX_FILE_SIZE_MB} and
     * {@link #LOG_FAULT_PROCESSING_TIME_THRESHOLD_SECS} are optional; a missing, blank or
     * {@code $UNDEFINED$} value leaves the corresponding default in place.
     *
     * @param context mapper context providing the configuration and the output target
     * @throws RuntimeException when one of the mandatory named outputs is not configured
     * @throws NumberFormatException when a numeric property does not hold an integer
     */
    public void setup(Mapper<AvroKey<T>, NullWritable, NullWritable, NullWritable>.Context context) throws IOException, InterruptedException {
        this.namedOutputMeta = requireNamedOutput(context.getConfiguration().get("output.meta"), "no named output provided for metadata");
        this.namedOutputPlaintext = requireNamedOutput(context.getConfiguration().get("output.plaintext"), "no named output provided for plaintext");
        this.namedOutputFault = requireNamedOutput(context.getConfiguration().get("output.fault"), "no named output provided for fault");

        String excludedIdsValue = context.getConfiguration().get("excluded.ids");
        if (isDefined(excludedIdsValue)) {
            this.log.warn("got excluded ids: " + excludedIdsValue);
            this.excludedIds = new HashSet<String>(Arrays.asList(StringUtils.split(excludedIdsValue.trim(), ',')));
        } else {
            this.log.warn("got no excluded ids");
        }

        String maxFileSizeMbValue = context.getConfiguration().get(IMPORT_CONTENT_MAX_FILE_SIZE_MB);
        if (isDefined(maxFileSizeMbValue)) {
            // long arithmetic: an int product would silently overflow for large limits
            this.maxFileSizeKB = 1024L * Integer.parseInt(maxFileSizeMbValue.trim());
        }

        String thresholdSecsValue = context.getConfiguration().get(LOG_FAULT_PROCESSING_TIME_THRESHOLD_SECS);
        if (isDefined(thresholdSecsValue)) {
            // long arithmetic for the same reason; the threshold is compared against milliseconds
            this.processingTimeThreshold = 1000L * Integer.parseInt(thresholdSecsValue.trim());
        }

        this.mos = new MultipleOutputs(context);
        this.currentProgress = 0;
        this.intervalTime = System.currentTimeMillis();
    }

    /** Returns the given named-output value, or fails with the given message when it is null or empty. */
    private static String requireNamedOutput(String value, String messageWhenMissing) {
        if (value == null || value.isEmpty()) {
            throw new RuntimeException(messageWhenMissing);
        }
        return value;
    }

    /** Tells whether a configuration value was really provided, i.e. is neither blank nor the {@code $UNDEFINED$} placeholder. */
    private static boolean isDefined(String value) {
        if (value == null) {
            return false;
        }
        String trimmed = value.trim();
        return !trimmed.isEmpty() && !"$UNDEFINED$".equals(trimmed);
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /* JADX WARN: Finally extract failed */
    /**
     * Extracts metadata and plaintext from one PDF content stream and writes the results to the
     * named outputs.
     *
     * <ul>
     * <li>An identifier listed in {@link #excludedIds} is skipped without writing anything.</li>
     * <li>Content larger than {@link #maxFileSizeKB} gets empty metadata and empty plaintext records.</li>
     * <li>Otherwise the content is analysed. A non-critical failure is logged, an empty record is
     * written in place of the missing result and a fault record is added.</li>
     * <li>Processing that takes longer than {@link #processingTimeThreshold} adds a
     * {@link #FAULT_CODE_PROCESSING_TIME_THRESHOLD_EXCEEDED} fault.</li>
     * </ul>
     *
     * <p>The stream is closed on every path. On a critical failure the multiple outputs are closed
     * before the failure is rethrown.
     *
     * @param charSequence identifier of the processed document
     * @param inputStream PDF content; closed by this method
     * @param j content size in kilobytes, compared against {@link #maxFileSizeKB}
     * @param map supplementary data copied into generated fault records, may be null
     * @throws IOException when writing an output or closing the stream fails
     * @throws InterruptedException when writing or closing the outputs is interrupted
     * @throws RuntimeException wrapping any failure that is treated as critical
     */
    public void processStream(CharSequence charSequence, InputStream inputStream, long j, Map<CharSequence, CharSequence> map) throws IOException, InterruptedException {
        try {
            this.currentProgress++;
            // the interval is settable; zero would make the modulo throw ArithmeticException
            if (this.currentProgress > 0 && this.progresLogInterval != 0 && this.currentProgress % this.progresLogInterval == 0) {
                this.log.warn("metadata extaction progress: " + this.currentProgress + ", time taken to process " + this.progresLogInterval + " elements: " + ((System.currentTimeMillis() - this.intervalTime) / 1000) + " secs");
                this.intervalTime = System.currentTimeMillis();
            }
            // excludedIds holds Strings: a CharSequence of another type never equals a String,
            // so the lookup has to be done on the textual form
            if (this.excludedIds != null && charSequence != null && this.excludedIds.contains(charSequence.toString())) {
                this.log.warn("skipping processing for excluded id " + charSequence);
                closeContentStream(inputStream);
                return;
            }
            if (j > this.maxFileSizeKB) {
                this.log.warn("skipping processing for id " + charSequence + " due to max file size limit=" + this.maxFileSizeKB + " KB exceeded: " + j + " KB");
                writeEmptyResults(charSequence, inputStream);
                return;
            }
            extractAndWrite(charSequence, inputStream, map);
        } catch (AnalysisException e) {
            // an analysis failure not handled closer to its source is always critical
            throw closeOutputsAndWrap(e);
        }
    }

    /** Writes empty metadata and plaintext records for content that is not analysed, then closes the stream. */
    private void writeEmptyResults(CharSequence documentId, InputStream contentStream) throws IOException, InterruptedException, AnalysisException {
        try {
            this.mos.write(this.namedOutputMeta, new AvroKey(NlmToDocumentWithBasicMetadataConverter.convertFull(documentId.toString(), null)));
            this.mos.write(this.namedOutputPlaintext, new AvroKey(NlmToDocumentContentConverter.convert(documentId.toString(), null)));
        } catch (TransformationException e) {
            throw closeOutputsAndWrap(e);
        } catch (JDOMException e) {
            throw closeOutputsAndWrap(e);
        } finally {
            closeContentStream(contentStream);
        }
    }

    /** Runs the extraction for one document, writes its results and reports overly long processing as a fault. */
    private void extractAndWrite(CharSequence documentId, InputStream contentStream, Map<CharSequence, CharSequence> supplementaryData) throws IOException, InterruptedException, AnalysisException {
        this.log.warn("starting processing for id: " + documentId);
        long startTime = System.currentTimeMillis();
        ContentExtractor extractor = new ContentExtractor();
        try {
            extractor.uploadPDF(contentStream);
            writeExtractedMetadata(extractor, documentId, supplementaryData);
            writeExtractedPlaintext(extractor, documentId, supplementaryData);
        } finally {
            closeContentStream(contentStream);
        }
        long processingTime = System.currentTimeMillis() - startTime;
        if (processingTime > this.processingTimeThreshold) {
            Map<CharSequence, CharSequence> faultData = new HashMap<CharSequence, CharSequence>();
            if (supplementaryData != null) {
                faultData.putAll(supplementaryData);
            }
            faultData.put(FAULT_SUPPLEMENTARY_DATA_PROCESSING_TIME, String.valueOf(processingTime));
            this.mos.write(this.namedOutputFault, new AvroKey<Fault>(Fault.newBuilder().setInputObjectId(documentId).setTimestamp(System.currentTimeMillis()).setCode(FAULT_CODE_PROCESSING_TIME_THRESHOLD_EXCEEDED).setSupplementaryData(faultData).build()));
        }
        this.log.warn("finished processing for id " + documentId + " in " + (processingTime / 1000) + " secs");
    }

    /** Writes the metadata record built from the extracted NLM content; a non-critical failure yields an empty record plus a fault. */
    private void writeExtractedMetadata(ContentExtractor extractor, CharSequence documentId, Map<CharSequence, CharSequence> supplementaryData) throws IOException, InterruptedException, AnalysisException {
        try {
            Element nlmContent = extractor.getNLMContent();
            Document nlmDocument = new Document(nlmContent);
            if (this.log.isDebugEnabled()) {
                // pretty-printing the whole document is only worth it when the message is emitted
                this.log.debug("got NLM content: \n" + new XMLOutputter(Format.getPrettyFormat()).outputString(nlmContent));
            }
            this.mos.write(this.namedOutputMeta, new AvroKey(NlmToDocumentWithBasicMetadataConverter.convertFull(documentId.toString(), nlmDocument)));
        } catch (JDOMException e) {
            throw closeOutputsAndWrap(e);
        } catch (TransformationException e) {
            throw closeOutputsAndWrap(e);
        } catch (AnalysisException e) {
            if (this.analysisExceptionAsCritical) {
                throw closeOutputsAndWrap(e);
            }
            if (e.getCause() instanceof InvalidPdfException) {
                this.log.error("Invalid PDF file", e);
            } else {
                this.log.error("got unexpected analysis exception, just logging", e);
            }
            writeEmptyMetadataWithFault(documentId, e, supplementaryData);
        } catch (Exception e) {
            // must stay last: the more specific handlers above would otherwise be unreachable
            if (this.otherExceptionAsCritical) {
                throw closeOutputsAndWrap(e);
            }
            this.log.error("got unexpected exception, just logging", e);
            writeEmptyMetadataWithFault(documentId, e, supplementaryData);
        }
    }

    /** Writes the plaintext record; a non-critical analysis failure yields an empty record plus a fault. */
    private void writeExtractedPlaintext(ContentExtractor extractor, CharSequence documentId, Map<CharSequence, CharSequence> supplementaryData) throws IOException, InterruptedException, AnalysisException {
        try {
            this.mos.write(this.namedOutputPlaintext, new AvroKey(NlmToDocumentContentConverter.convert(documentId.toString(), extractor.getRawFullText())));
        } catch (AnalysisException e) {
            if (this.analysisExceptionAsCritical) {
                throw closeOutputsAndWrap(e);
            }
            if (e.getCause() instanceof InvalidPdfException) {
                this.log.error("Invalid PDF file when retrieving plaintext", e);
            } else {
                this.log.error("got unexpected analysis exception when retrieving plaintext, just logging", e);
            }
            this.mos.write(this.namedOutputPlaintext, new AvroKey(NlmToDocumentContentConverter.convert(documentId.toString(), null)));
            this.mos.write(this.namedOutputFault, new AvroKey<Fault>(exceptionToFault(documentId, e, supplementaryData)));
        }
    }

    /** Writes an empty metadata record together with a fault record describing the given failure. */
    private void writeEmptyMetadataWithFault(CharSequence documentId, Throwable failure, Map<CharSequence, CharSequence> supplementaryData) throws IOException, InterruptedException, AnalysisException {
        try {
            this.mos.write(this.namedOutputMeta, new AvroKey(NlmToDocumentWithBasicMetadataConverter.convertFull(documentId.toString(), null)));
            this.mos.write(this.namedOutputFault, new AvroKey<Fault>(exceptionToFault(documentId, failure, supplementaryData)));
        } catch (TransformationException e) {
            throw closeOutputsAndWrap(e);
        } catch (JDOMException e) {
            throw closeOutputsAndWrap(e);
        }
    }

    /** Closes the multiple outputs and returns the given failure wrapped in a RuntimeException, ready to be thrown by the caller. */
    private RuntimeException closeOutputsAndWrap(Throwable failure) throws IOException, InterruptedException {
        this.log.debug("closing multiple outputs...");
        this.mos.close();
        this.log.debug("multiple outputs closed");
        return new RuntimeException(failure);
    }

    /** Closes the content stream when one was provided. */
    private static void closeContentStream(InputStream contentStream) throws IOException {
        if (contentStream != null) {
            contentStream.close();
        }
    }

    /**
     * Builds a fault record describing the given throwable.
     *
     * <p>The record carries the current time, the throwable's class name as code, its message and
     * its printed stack trace. The cause chain is attached when the throwable has a cause, and the
     * supplementary data when it is neither null nor empty.
     *
     * @param charSequence identifier of the object whose processing failed
     * @param th failure to describe
     * @param map supplementary data for the fault, may be null
     * @return the built fault record
     */
    protected static Fault exceptionToFault(CharSequence charSequence, Throwable th, Map<CharSequence, CharSequence> map) {
        Fault.Builder fault = Fault.newBuilder()
                .setInputObjectId(charSequence)
                .setTimestamp(System.currentTimeMillis())
                .setCode(th.getClass().getName());
        fault.setMessage(th.getMessage());

        // render the stack trace into memory
        StringWriter traceText = new StringWriter();
        PrintWriter traceSink = new PrintWriter(traceText);
        th.printStackTrace(traceSink);
        traceSink.close();
        fault.setStackTrace(traceText.toString());

        if (th.getCause() != null) {
            List<Cause> causeChain = new ArrayList<Cause>();
            fault.setCauses(appendThrowableToCauses(th.getCause(), causeChain));
        }
        boolean hasSupplementaryData = map != null && !map.isEmpty();
        if (hasSupplementaryData) {
            fault.setSupplementaryData(map);
        }
        return fault.build();
    }

    /**
     * Appends one cause entry (class name and message) for the given throwable and for every
     * throwable further down its cause chain, in chain order.
     *
     * @param th first throwable to record
     * @param list list receiving the entries
     * @return the same list instance that was passed in
     */
    protected static List<Cause> appendThrowableToCauses(Throwable th, List<Cause> list) {
        Throwable current = th;
        do {
            Cause.Builder entry = Cause.newBuilder();
            entry.setCode(current.getClass().getName());
            entry.setMessage(current.getMessage());
            list.add(entry.build());
            current = current.getCause();
        } while (current != null);
        return list;
    }

    /**
     * Closes the multiple outputs at the end of the task.
     *
     * <p>Does nothing when the outputs were never created, i.e. when {@code setup} did not
     * run to completion.
     *
     * @param context mapper context, not used
     * @throws IOException when closing the outputs fails
     * @throws InterruptedException when closing the outputs is interrupted
     */
    public void cleanup(Mapper<AvroKey<T>, NullWritable, NullWritable, NullWritable>.Context context) throws IOException, InterruptedException {
        if (this.mos == null) {
            // the field is only assigned at the end of setup; nothing to close before that
            return;
        }
        this.log.debug("cleanup: closing multiple outputs...");
        this.mos.close();
        this.log.debug("cleanup: multiple outputs closed");
    }

    /**
     * Chooses how analysis exceptions are handled.
     *
     * @param z true to close the outputs and rethrow, false to log and record a fault
     */
    public void setAnalysisExceptionAsCritical(boolean z) {
        analysisExceptionAsCritical = z;
    }

    /**
     * Chooses how exceptions other than analysis exceptions are handled while writing metadata.
     *
     * @param z true to close the outputs and rethrow, false to log and record a fault
     */
    public void setOtherExceptionAsCritical(boolean z) {
        otherExceptionAsCritical = z;
    }

    /**
     * Sets how many processed elements lie between two progress log messages.
     *
     * @param i number of elements per progress interval
     */
    public void setProgresLogInterval(int i) {
        progresLogInterval = i;
    }
}
