/*
 * Decompiled with CFR 0.152.
 */
package eu.dnetlib.dhp.sx.bio.ebi;

import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.AbstractScalaApplication;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.sx.bio.ebi.SparkCreatePubmedDump$;
import eu.dnetlib.dhp.sx.bio.pubmed.PMArticle;
import eu.dnetlib.dhp.sx.bio.pubmed.PMParser2;
import eu.dnetlib.dhp.sx.bio.pubmed.PubMedToOaf$;
import eu.dnetlib.dhp.utils.DHPUtils;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import java.io.Serializable;
import org.apache.hadoop.conf.Configuration;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders$;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import scala.Function1;
import scala.Predef$;
import scala.collection.immutable.Nil$;
import scala.collection.immutable.Seq;
import scala.reflect.ScalaSignature;
import scala.runtime.BoxesRunTime;

@ScalaSignature(bytes="\u0006\u0005=4Aa\u0003\u0007\u00013!I\u0001\u0005\u0001B\u0001B\u0003%\u0011E\f\u0005\n_\u0001\u0011\t\u0011)A\u0005aQB\u0001\"\u000e\u0001\u0003\u0002\u0003\u0006IA\u000e\u0005\u0006}\u0001!\ta\u0010\u0005\u0006\u000b\u0002!\tE\u0012\u0005\u0006\u0015\u0002!\taS\u0004\u0006K2A\tA\u001a\u0004\u0006\u00171A\ta\u001a\u0005\u0006}!!\ta\u001b\u0005\u0006Y\"!\t!\u001c\u0002\u0016'B\f'o[\"sK\u0006$X\rU;c[\u0016$G)^7q\u0015\tia\"A\u0002fE&T!a\u0004\t\u0002\u0007\tLwN\u0003\u0002\u0012%\u0005\u00111\u000f\u001f\u0006\u0003'Q\t1\u0001\u001a5q\u0015\t)b#A\u0004e]\u0016$H.\u001b2\u000b\u0003]\t!!Z;\u0004\u0001M\u0011\u0001A\u0007\t\u00037yi\u0011\u0001\b\u0006\u0003;I\t1\"\u00199qY&\u001c\u0017\r^5p]&\u0011q\u0004\b\u0002\u0019\u0003\n\u001cHO]1diN\u001b\u0017\r\\1BaBd\u0017nY1uS>t\u0017\u0001\u00049s_B,'\u000f^=QCRD\u0007C\u0001\u0012,\u001d\t\u0019\u0013\u0006\u0005\u0002%O5\tQE\u0003\u0002'1\u00051AH]8pizR\u0011\u0001K\u0001\u0006g\u000e\fG.Y\u0005\u0003U\u001d\na\u0001\u0015:fI\u00164\u0017B\u0001\u0017.\u0005\u0019\u0019FO]5oO*\u0011!fJ\u0005\u0003Ay\tA!\u0019:hgB\u0019\u0011GM\u0011\u000e\u0003\u001dJ!aM\u0014\u0003\u000b\u0005\u0013(/Y=\n\u0005=r\u0012a\u00017pOB\u0011q\u0007P\u0007\u0002q)\u0011\u0011HO\u0001\u0006g24GG\u001b\u0006\u0002w\u0005\u0019qN]4\n\u0005uB$A\u0002'pO\u001e,'/\u0001\u0004=S:LGO\u0010\u000b\u0005\u0001\n\u001bE\t\u0005\u0002B\u00015\tA\u0002C\u0003!\t\u0001\u0007\u0011\u0005C\u00030\t\u0001\u0007\u0001\u0007C\u00036\t\u0001\u0007a'A\u0002sk:$\u0012a\u0012\t\u0003c!K!!S\u0014\u0003\tUs\u0017\u000e^\u0001\u0011GJ,\u0017\r^3Qk\nlW\r\u001a#v[B$Ra\u0012'X3nCQ!\u0014\u0004A\u00029\u000bQa\u001d9be.\u0004\"aT+\u000e\u0003AS!!\u0015*\u0002\u0007M\fHN\u0003\u0002N'*\u0011AKO\u0001\u0007CB\f7\r[3\n\u0005Y\u0003&\u0001D*qCJ\\7+Z:tS>t\u0007\"\u0002-\u0007\u0001\u0004\t\u0013AC:pkJ\u001cW\rU1uQ\")!L\u0002a\u0001C\u0005QA/\u0019:hKR\u0004\u0016\r\u001e5\t\u000bq3\u0001\u0019A/\u0002\u0019Y|7-\u00192vY\u0006\u0014\u0018.Z:\u0011\u0005y\u001bW\"A0\u000b\u0005\u0001\f\u0017A\u0003<pG\u0006\u0014W\u000f\\1ss*\u0011!ME\u0001\u0007G>lWn\u001c8\n\u0005\u0011|&a\u0004,pG\u0006\u0014W\u000f\\1ss\u001e\u0013x.\u001e9\u0002+M\u0003\u0018M]6De\u0016\fG/\u001a)vE6,G\rR;naB\u0011\u0011\tC\n\u0003\u0011!\u0004\"!M5\n\u0005)<#AB!osJ+g\rF\u0001g\u0003\u0011i\u0017-\u001b8\u0015\u0005\u001ds\u0007\"B\u0018\u000b\u0001\u0004\u0001\u0004")
public class SparkCreatePubmedDump
extends AbstractScalaApplication {
    private final Logger log;

    public static void main(String[] args) {
        SparkCreatePubmedDump$.MODULE$.main(args);
    }

    public void run() {
        String isLookupUrl = this.parser().get("isLookupUrl");
        this.log.info("isLookupUrl: {}", (Object)isLookupUrl);
        String sourcePath = this.parser().get("sourcePath");
        this.log.info("SourcePath is '" + sourcePath + "'");
        String mdstoreOutputVersion = this.parser().get("mdstoreOutputVersion");
        this.log.info("mdstoreOutputVersion is '" + mdstoreOutputVersion + "'");
        ObjectMapper mapper = new ObjectMapper();
        MDStoreVersion cleanedMdStoreVersion = (MDStoreVersion)mapper.readValue(mdstoreOutputVersion, MDStoreVersion.class);
        String outputBasePath = cleanedMdStoreVersion.getHdfsPath();
        this.log.info("outputBasePath is '" + outputBasePath + "'");
        ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService((String)isLookupUrl);
        VocabularyGroup vocabularies = VocabularyGroup.loadVocsFromIS((ISLookUpService)isLookupService);
        this.createPubmedDump(this.spark(), sourcePath, outputBasePath, vocabularies);
    }

    public void createPubmedDump(SparkSession spark, String sourcePath, String targetPath, VocabularyGroup vocabularies) {
        Predef$.MODULE$.require(spark != null);
        Encoder PMEncoder = Encoders$.MODULE$.bean(PMArticle.class);
        Dataset df = spark.read().option("lineSep", "</PubmedArticle>").text(sourcePath);
        ObjectMapper mapper = new ObjectMapper();
        df.as(spark.implicits().newStringEncoder()).map((Function1 & Serializable)s -> {
            int id = s.indexOf("<PubmedArticle>");
            if (id >= 0) {
                return s.substring(id) + "</PubmedArticle>";
            }
            return null;
        }, spark.implicits().newStringEncoder()).filter((Function1 & Serializable)s -> BoxesRunTime.boxToBoolean((boolean)SparkCreatePubmedDump.$anonfun$createPubmedDump$2(s))).map((Function1 & Serializable)i -> {
            PMArticle pMArticle;
            try {
                pMArticle = new PMParser2().parse((String)i);
            }
            catch (Exception exception) {
                throw new RuntimeException("Error parsing article: " + i);
            }
            return pMArticle;
        }, PMEncoder).dropDuplicates("pmid", (Seq)Nil$.MODULE$).map((Function1 & Serializable)a -> {
            Oaf oaf = PubMedToOaf$.MODULE$.convert((PMArticle)a, vocabularies);
            if (oaf != null) {
                return mapper.writeValueAsString((Object)oaf);
            }
            return null;
        }, spark.implicits().newStringEncoder()).as(spark.implicits().newStringEncoder()).filter((Function1 & Serializable)s -> BoxesRunTime.boxToBoolean((boolean)SparkCreatePubmedDump.$anonfun$createPubmedDump$5(s))).write().option("compression", "gzip").mode("overwrite").text(targetPath + "/store");
        long mdStoreSize = spark.read().text(targetPath + "/store").count();
        DHPUtils.writeHdfsFile((Configuration)spark.sparkContext().hadoopConfiguration(), (String)String.valueOf(BoxesRunTime.boxToLong((long)mdStoreSize)), (String)(targetPath + "/size"));
    }

    public static final /* synthetic */ boolean $anonfun$createPubmedDump$2(String s) {
        return s != null;
    }

    public static final /* synthetic */ boolean $anonfun$createPubmedDump$5(String s) {
        return s != null;
    }

    public SparkCreatePubmedDump(String propertyPath, String[] args, Logger log) {
        this.log = log;
        super(propertyPath, args, log);
    }
}

