package org.archive.modules.extractor;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.logging.Logger;
import org.apache.commons.httpclient.URIException;
import org.archive.io.SinkHandlerLogThread;
import org.archive.modules.CrawlURI;
import org.archive.net.UURIFactory;
import org.archive.util.FileUtils;

/* loaded from: input_file:WEB-INF/lib/heritrix-modules-3.1.0.jar:org/archive/modules/extractor/ExtractorPDF.class */
/**
 * Content extractor that harvests outlinks from PDF responses.
 *
 * <p>The recorded content body is copied into a temporary file, handed to
 * {@link PDFParser}, and every URI string the parser reports is added to the
 * {@link CrawlURI} as a {@code NAVLINK_MISC} / {@code Hop.NAVLINK} outlink.
 * Only URIs whose recorded input does not exceed {@link #getMaxSizeToParse()}
 * and whose content type starts with {@code application/pdf} are processed.
 */
public class ExtractorPDF extends ContentExtractor {
    private static final long serialVersionUID = 3L;
    private static final Logger LOGGER = Logger.getLogger(ExtractorPDF.class.getName());

    /**
     * Creates the extractor with a default parse limit of 10485760
     * (10 * 1024 * 1024; presumably bytes, matching the recorder's size unit).
     */
    public ExtractorPDF() {
        setMaxSizeToParse(10485760L);
    }

    /**
     * Returns the largest recorded-input size this extractor will parse.
     *
     * @return the value stored under the {@code maxSizeToParse} property
     */
    public long getMaxSizeToParse() {
        Long limit = (Long) this.kp.get("maxSizeToParse");
        return limit.longValue();
    }

    /**
     * Stores a new upper bound on the recorded-input size that will be parsed.
     *
     * @param j the new limit, saved under the {@code maxSizeToParse} property
     */
    public void setMaxSizeToParse(long j) {
        this.kp.put("maxSizeToParse", Long.valueOf(j));
    }

    /**
     * Decides whether the given URI is a PDF that is small enough to parse.
     *
     * @param crawlURI the URI whose recorded size and content type are checked
     * @return {@code true} only if the recorded input size is within the limit
     *         and the content type is non-null and starts with
     *         {@code application/pdf}
     */
    @Override // org.archive.modules.extractor.ContentExtractor
    protected boolean shouldExtract(CrawlURI crawlURI) {
        // The size check comes first, exactly as before: oversized content is
        // rejected without the content type ever being consulted.
        long recordedSize = crawlURI.getRecorder().getRecordedInput().getSize();
        if (recordedSize > getMaxSizeToParse()) {
            return false;
        }
        String mimeType = crawlURI.getContentType();
        if (mimeType == null) {
            return false;
        }
        return mimeType.startsWith("application/pdf");
    }

    /**
     * Parses the PDF body of the given URI and records every discovered link.
     *
     * @param crawlURI the URI whose recorded content body is parsed
     * @return {@code true} when parsing finished (including when the parser
     *         returned no URI list); {@code false} when an {@link IOException}
     *         or {@link RuntimeException} occurred, in which case the exception
     *         is added to the URI's non-fatal failures
     * @throws RuntimeException wrapping the {@link IOException} if the temporary
     *         file cannot be created
     */
    @Override // org.archive.modules.extractor.ContentExtractor
    protected boolean innerExtract(CrawlURI crawlURI) {
        String prefix = tempFilePrefix(Thread.currentThread());

        File scratchFile;
        try {
            scratchFile = File.createTempFile(prefix, "tmp.pdf");
        } catch (IOException creationFailure) {
            throw new RuntimeException(creationFailure);
        }

        try {
            crawlURI.getRecorder().copyContentBodyTo(scratchFile);
            PDFParser parser = new PDFParser(scratchFile.getAbsolutePath());
            ArrayList<String> found = parser.extractURIs();
            if (found != null) {
                for (String candidate : found) {
                    try {
                        crawlURI.getOutLinks().add(
                                new Link(
                                        crawlURI.getUURI(),
                                        UURIFactory.getInstance(candidate),
                                        LinkContext.NAVLINK_MISC,
                                        Hop.NAVLINK));
                    } catch (URIException badUri) {
                        // A malformed URI is reported and skipped; the
                        // remaining candidates are still processed.
                        logUriError(badUri, crawlURI.getUURI(), candidate);
                    }
                }
                // The counter reflects every URI the parser returned, including
                // any that failed to convert above.
                int linkCount = found.size();
                this.numberOfLinksExtracted.addAndGet(linkCount);
                LOGGER.fine(crawlURI + " has " + linkCount + " links.");
            }
            return true;
        } catch (IOException ioFailure) {
            crawlURI.getNonFatalFailures().add(ioFailure);
            return false;
        } catch (RuntimeException parseFailure) {
            crawlURI.getNonFatalFailures().add(parseFailure);
            return false;
        } finally {
            // Runs on every exit path so the temporary copy is always
            // scheduled for deletion.
            FileUtils.deleteSoonerOrLater(scratchFile);
        }
    }

    /**
     * Builds the temporary-file prefix for the given thread: {@code "tt"}
     * followed by the thread's serial number when it is a
     * {@link SinkHandlerLogThread}, otherwise by its identity hash code.
     */
    private static String tempFilePrefix(Object thread) {
        if (thread instanceof SinkHandlerLogThread) {
            return "tt" + ((SinkHandlerLogThread) thread).getSerialNumber();
        }
        return "tt" + System.identityHashCode(thread);
    }
}
