package org.archive.modules.extractor;

import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import org.apache.commons.lang.StringUtils;
import org.archive.io.ReplayCharSequence;
import org.archive.modules.CrawlURI;
import org.archive.modules.Processor;
import org.archive.util.TextUtils;

/* loaded from: input_file:WEB-INF/lib/heritrix-modules-3.1.0.jar:org/archive/modules/extractor/HTTPContentDigest.class */
/**
 * Processor that sets a URI's content digest to a SHA1 hash of its recorded
 * text content, optionally after replacing every match of a configurable
 * regular expression with a single space.
 *
 * <p>Only URIs whose content type starts with {@code "text"} and whose
 * content size does not exceed {@link #getMaxSizeToDigest()} are processed.
 */
public class HTTPContentDigest extends Processor {
    private static final long serialVersionUID = 3;
    private static final Logger logger = Logger.getLogger(HTTPContentDigest.class.getName());
    private static final String SHA1 = "SHA1";

    /**
     * Returns the regular expression whose matches are replaced before hashing.
     *
     * @return the configured regex; an empty string means nothing is stripped
     */
    public String getStripRegex() {
        return (String) this.kp.get("stripRegex");
    }

    /**
     * Sets the regular expression whose matches are replaced before hashing.
     *
     * @param str the regex to apply; empty or {@code null} disables stripping
     */
    public void setStripRegex(String str) {
        this.kp.put("stripRegex", str);
    }

    /**
     * Returns the largest content size this processor will digest.
     *
     * @return the size limit; a value of -1 or lower disables the limit
     */
    public long getMaxSizeToDigest() {
        return ((Long) this.kp.get("maxSizeToDigest")).longValue();
    }

    /**
     * Sets the largest content size this processor will digest.
     *
     * @param j the size limit; a value of -1 or lower disables the limit
     */
    public void setMaxSizeToDigest(long j) {
        this.kp.put("maxSizeToDigest", Long.valueOf(j));
    }

    /**
     * Creates a processor with no strip regex and a size limit of 1048576.
     */
    public HTTPContentDigest() {
        setStripRegex("");
        setMaxSizeToDigest(1048576L);
    }

    /**
     * Decides whether the given URI's content should be re-digested.
     *
     * @param crawlURI the URI under consideration
     * @return {@code true} if the content type starts with {@code "text"} and
     *         the content size is within the configured limit (or the limit
     *         is disabled); {@code false} otherwise, including when no
     *         content type is available
     */
    @Override // org.archive.modules.Processor
    protected boolean shouldProcess(CrawlURI crawlURI) {
        String contentType = crawlURI.getContentType();
        // Guard against a missing content type instead of failing with an NPE.
        if (contentType == null || !contentType.startsWith("text")) {
            return false;
        }
        long maxSizeToDigest = getMaxSizeToDigest();
        return maxSizeToDigest <= -1 || maxSizeToDigest >= crawlURI.getContentSize();
    }

    /**
     * Computes the SHA1 digest of the URI's recorded content (after applying
     * the strip regex, if any) and stores it on the URI.
     *
     * <p>Any exception raised while reading or hashing the content is added
     * to the URI's non-fatal failures and logged as a warning; the URI's
     * digest is then left untouched.
     *
     * @param crawlURI the URI whose content digest is to be set
     * @throws InterruptedException declared by the overridden method
     */
    @Override // org.archive.modules.Processor
    protected void innerProcess(CrawlURI crawlURI) throws InterruptedException {
        String stripRegex = getStripRegex();
        try {
            ReplayCharSequence content = crawlURI.getRecorder().getContentReplayCharSequence();

            MessageDigest messageDigest;
            try {
                messageDigest = MessageDigest.getInstance(SHA1);
            } catch (NoSuchAlgorithmException e) {
                // Report through the class logger (with the cause) rather than stderr.
                logger.log(Level.SEVERE, "Digest algorithm " + SHA1 + " is not available", e);
                return;
            }

            String text;
            if (StringUtils.isEmpty(stripRegex)) {
                text = content.toString();
            } else {
                Matcher matcher = TextUtils.getMatcher(stripRegex, content);
                try {
                    text = matcher.replaceAll(" ");
                } finally {
                    // Always hand the matcher back, even if replaceAll fails.
                    TextUtils.recycleMatcher(matcher);
                }
            }

            // NOTE(review): getBytes() uses the platform default charset, so the
            // digest of non-ASCII content can differ between JVMs. Left as-is
            // because changing it would alter previously computed digests.
            messageDigest.update(text.getBytes());
            crawlURI.setContentDigest(SHA1, messageDigest.digest());
        } catch (Exception e2) {
            crawlURI.getNonFatalFailures().add(e2);
            logger.warning("Failed get of replay char sequence " + crawlURI.toString() + " " + e2.getMessage() + " " + Thread.currentThread().getName());
        }
    }
}
