package eu.dnetlib.iis.importer.content;

import eu.dnetlib.iis.core.java.HadoopContext;
import eu.dnetlib.iis.core.java.PortBindings;
import eu.dnetlib.iis.core.java.Process;
import eu.dnetlib.iis.core.java.io.CloseableIterator;
import eu.dnetlib.iis.core.java.io.DataStore;
import eu.dnetlib.iis.core.java.io.FileSystemPath;
import eu.dnetlib.iis.core.java.porttype.AvroPortType;
import eu.dnetlib.iis.core.java.porttype.PortType;
import eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl;
import eu.dnetlib.iis.metadataextraction.schemas.DocumentText;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import org.apache.avro.file.DataFileWriter;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;

/* loaded from: input_file:eu/dnetlib/iis/importer/content/DocumentTextUrlBasedImporterProcess.class */
/**
 * Process that reads {@link DocumentContentUrl} records from the {@code content_url} input port,
 * downloads the content found under each record's URL and writes one {@link DocumentText} record
 * per input record to the {@code text} output port.
 *
 * <p>The read and connection timeouts handed to the content retrieval call are taken from the
 * process parameters ({@code import.content.read.timeout},
 * {@code import.content.connection.timeout}) when present, otherwise from the Hadoop
 * configuration under the same keys, falling back to {@code 60000}.
 */
public class DocumentTextUrlBasedImporterProcess implements Process {
    private static final Logger log = Logger.getLogger(DocumentTextUrlBasedImporterProcess.class);

    /** Name of the input port carrying {@link DocumentContentUrl} records. */
    private static final String CONTENT_URL_PORT = "content_url";

    /** Name of the output port receiving {@link DocumentText} records. */
    private static final String TEXT_PORT = "text";

    private static final String READ_TIMEOUT_KEY = "import.content.read.timeout";
    private static final String CONNECTION_TIMEOUT_KEY = "import.content.connection.timeout";

    /** Fallback for both timeouts; presumably milliseconds - TODO confirm against the retrieval utility. */
    private static final int DEFAULT_TIMEOUT = 60000;

    /** Number of processed records between two progress log entries. */
    private static final int PROGRESS_LOG_INTERVAL = 10000;

    /**
     * @return the single input port, {@code content_url}, typed with the
     *         {@link DocumentContentUrl} Avro schema
     */
    public Map<String, PortType> getInputPorts() {
        return createInputPorts();
    }

    /**
     * @return the single output port, {@code text}, typed with the {@link DocumentText} Avro schema
     */
    public Map<String, PortType> getOutputPorts() {
        return createOutputPorts();
    }

    /** Builds a fresh, mutable input port map. */
    private static Map<String, PortType> createInputPorts() {
        Map<String, PortType> ports = new HashMap<>();
        ports.put(CONTENT_URL_PORT, new AvroPortType(DocumentContentUrl.SCHEMA$));
        return ports;
    }

    /** Builds a fresh, mutable output port map. */
    private static Map<String, PortType> createOutputPorts() {
        Map<String, PortType> ports = new HashMap<>();
        ports.put(TEXT_PORT, new AvroPortType(DocumentText.SCHEMA$));
        return ports;
    }

    /**
     * Imports the text content of every record available on the {@code content_url} port.
     *
     * @param portBindings  bindings providing the paths of the input and output ports
     * @param hadoopContext source of the Hadoop configuration used for the file system and for
     *                      timeout defaults
     * @param map           process parameters; may override the timeouts
     * @throws IOException           when reading input, retrieving content or writing output fails
     * @throws NumberFormatException when a timeout parameter is present but is not a valid integer
     */
    public void run(PortBindings portBindings, HadoopContext hadoopContext, Map<String, String> map) throws IOException {
        int readTimeout = resolveTimeout(map, hadoopContext, READ_TIMEOUT_KEY);
        int connectionTimeout = resolveTimeout(map, hadoopContext, CONNECTION_TIMEOUT_KEY);

        Path inputPath = (Path) portBindings.getInput().get(CONTENT_URL_PORT);
        Path outputPath = (Path) portBindings.getOutput().get(TEXT_PORT);

        // Not closed here: FileSystem.get may hand out a shared, cached instance.
        FileSystem fileSystem = FileSystem.get(hadoopContext.getConfiguration());

        // Raw types are kept on purpose: the generic signatures of DataStore and
        // CloseableIterator are not visible from this file.
        CloseableIterator reader = DataStore.getReader(new FileSystemPath(fileSystem, inputPath));
        try {
            // Opened inside the reader's try so the reader is released even if this call fails.
            DataFileWriter writer = DataStore.create(new FileSystemPath(fileSystem, outputPath), DocumentText.SCHEMA$);
            try {
                importTexts(reader, writer, connectionTimeout, readTimeout);
            } finally {
                writer.close();
            }
        } finally {
            // Separate finally blocks: a failure while closing one resource must not leak the other.
            reader.close();
        }
    }

    /**
     * Resolves a timeout value: the process parameter wins, then the Hadoop configuration entry
     * under the same key, then {@link #DEFAULT_TIMEOUT}.
     */
    private static int resolveTimeout(Map<String, String> parameters, HadoopContext hadoopContext, String key) {
        if (parameters.containsKey(key)) {
            return Integer.parseInt(parameters.get(key));
        }
        return hadoopContext.getConfiguration().getInt(key, DEFAULT_TIMEOUT);
    }

    /**
     * Drains {@code reader}, retrieving the content for each record and appending the resulting
     * {@link DocumentText} to {@code writer}. Neither argument is closed by this method.
     */
    @SuppressWarnings({"rawtypes", "unchecked"})
    private static void importTexts(CloseableIterator reader, DataFileWriter writer, int connectionTimeout, int readTimeout) throws IOException {
        long processed = 0;
        long batchStart = System.currentTimeMillis();
        while (reader.hasNext()) {
            DocumentContentUrl contentUrl = (DocumentContentUrl) reader.next();

            long retrievalStart = System.currentTimeMillis();
            byte[] content = ObjectStoreContentProviderUtils.getContentFromURL(contentUrl.getUrl().toString(), connectionTimeout, readTimeout);
            boolean gotContent = content != null && content.length > 0;
            // NOTE(review): routine per-record message logged at WARN; level left unchanged.
            log.warn("text content retrieval for id: " + contentUrl.getId() + " and location: " + contentUrl.getUrl() + " took: " + (System.currentTimeMillis() - retrievalStart) + " ms, got text content: " + gotContent);

            // Skip the very first iteration: nothing has been processed yet, so there is no batch to report.
            if (processed > 0 && processed % PROGRESS_LOG_INTERVAL == 0) {
                log.warn("retrived " + processed + " records, last " + PROGRESS_LOG_INTERVAL + " batch in " + ((System.currentTimeMillis() - batchStart) / 1000) + " secs");
                batchStart = System.currentTimeMillis();
            }

            DocumentText.Builder textBuilder = DocumentText.newBuilder();
            textBuilder.setId(contentUrl.getId());
            if (content != null) {
                // A record is emitted even without content; the text field is then left unset.
                textBuilder.setText(new String(content, StandardCharsets.UTF_8));
            }
            writer.append(textBuilder.build());
            // Flushed after every record, as in the original implementation.
            writer.flush();
            processed++;
        }
    }
}
