package eu.dnetlib.iis.referenceextraction.dataset.importer;

import eu.dnetlib.iis.core.java.PortBindings;
import eu.dnetlib.iis.core.java.Process;
import eu.dnetlib.iis.core.java.ProcessUtils;
import eu.dnetlib.iis.core.java.io.DataStore;
import eu.dnetlib.iis.core.java.io.FileSystemPath;
import eu.dnetlib.iis.core.java.porttype.AvroPortType;
import eu.dnetlib.iis.core.java.porttype.PortType;
import eu.dnetlib.iis.referenceextraction.dataset.schemas.DocumentToDataSet;
import eu.dnetlib.iis.referenceextraction.shared.importer.SharedImporterUtils;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.security.InvalidParameterException;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import org.apache.avro.file.DataFileWriter;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;

/* loaded from: input_file:eu/dnetlib/iis/referenceextraction/dataset/importer/DocumentToDataSetImporter.class */
/**
 * Process that imports document-to-dataset relations from CSV input and writes
 * them as {@link DocumentToDataSet} Avro records to the {@code output} port.
 *
 * <p>The input location is taken from the {@link #PARAM_CSV_PATH} parameter. It may
 * point to a single file or to a directory, in which case every entry below it is
 * processed recursively.
 */
public class DocumentToDataSetImporter implements Process {

    /** Name of the parameter holding the path of the CSV file or directory to import. */
    public static final String PARAM_CSV_PATH = "import.dataset.csv.path";

    /** Name of the only output port declared by this process. */
    private static final String outputPortName = "output";

    /** Prefix handed to {@link SharedImporterUtils#skipLine} to decide which lines are skipped. */
    private static final String skipLinePrefix = "arxiv_id";

    /** Character separating the fields within a single input line. */
    private static final char separatorChar = ',';

    private final Logger log = Logger.getLogger(getClass());

    /**
     * @return an empty map, this process declares no input ports
     */
    public Map<String, PortType> getInputPorts() {
        return Collections.emptyMap();
    }

    /**
     * @return a map with the single {@code output} port typed with the
     *         {@link DocumentToDataSet} Avro schema
     */
    public Map<String, PortType> getOutputPorts() {
        Map<String, PortType> ports = new HashMap<>();
        ports.put(outputPortName, new AvroPortType(DocumentToDataSet.SCHEMA$));
        return ports;
    }

    /**
     * Reads the CSV input and stores the extracted relations in the output datastore.
     *
     * @param portBindings bindings providing the location of the {@code output} port
     * @param configuration Hadoop configuration used to obtain the file system and parameters
     * @param map process parameters, consulted when resolving {@link #PARAM_CSV_PATH}
     * @throws InvalidParameterException when {@link #PARAM_CSV_PATH} is missing or empty
     * @throws Exception when reading the input or writing the output fails
     */
    public void run(PortBindings portBindings, Configuration configuration, Map<String, String> map) throws Exception {
        FileSystem fileSystem = FileSystem.get(configuration);
        String csvPath = ProcessUtils.getParameterValue(PARAM_CSV_PATH, configuration, map);
        if (csvPath == null || csvPath.isEmpty()) {
            throw new InvalidParameterException("required parameter '" + PARAM_CSV_PATH + "' is missing!");
        }
        FileSystemPath outputLocation =
                new FileSystemPath(fileSystem, (Path) portBindings.getOutput().get(outputPortName));
        // try-with-resources: a failure while closing the writer is recorded as a suppressed
        // exception instead of replacing the exception that caused the processing to fail.
        try (DataFileWriter<DocumentToDataSet> dataFileWriter =
                DataStore.create(outputLocation, DocumentToDataSet.SCHEMA$)) {
            processNode(fileSystem, new Path(csvPath), dataFileWriter);
        }
    }

    /**
     * Processes a single file system node. Directories are traversed recursively,
     * anything else is read line by line as UTF-8 text.
     *
     * <p>Lines accepted by {@link SharedImporterUtils#skipLine} are logged and skipped
     * (presumably header lines starting with {@code arxiv_id} - confirm against that helper).
     * Every remaining line is split on commas; the first two tokens, trimmed, become the
     * document id and the dataset id of the appended record. Lines yielding fewer than
     * two tokens are logged and ignored.
     *
     * @param fileSystem file system the path belongs to
     * @param path file or directory to process
     * @param dataFileWriter writer receiving the created records; not closed by this method
     * @throws Exception when listing, reading or writing fails
     */
    protected void processNode(FileSystem fileSystem, Path path, DataFileWriter<DocumentToDataSet> dataFileWriter) throws Exception {
        if (fileSystem.isDirectory(path)) {
            for (FileStatus child : fileSystem.listStatus(path)) {
                processNode(fileSystem, child.getPath(), dataFileWriter);
            }
            return;
        }
        // The raw stream is declared as its own resource so that it is released even if
        // wrapping it in the readers fails. The charset is given explicitly so decoding
        // does not depend on the default charset of the JVM the process happens to run on.
        try (InputStream inputStream = fileSystem.open(path);
                BufferedReader reader =
                        new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (SharedImporterUtils.skipLine(line, skipLinePrefix)) {
                    log.warn("skipping line: " + line);
                    continue;
                }
                String[] tokens = StringUtils.split(line, separatorChar);
                if (tokens.length < 2) {
                    log.warn("invalid line, unable to process: " + line);
                    continue;
                }
                DocumentToDataSet.Builder builder = DocumentToDataSet.newBuilder();
                builder.setDocumentId(generateDocumentId(tokens[0].trim()));
                builder.setDatasetId(generateDatasetId(tokens[1].trim()));
                dataFileWriter.append(builder.build());
            }
        }
    }

    /**
     * Builds the document identifier stored in the output record.
     *
     * @param str trimmed first field of an input line
     * @return the given value, unchanged
     */
    private String generateDocumentId(String str) {
        return str;
    }

    /**
     * Builds the dataset identifier stored in the output record.
     *
     * @param str trimmed second field of an input line
     * @return the given value, unchanged
     */
    private String generateDatasetId(String str) {
        return str;
    }
}
