package eu.dnetlib.iis.ingest.pmc.metadata;

import eu.dnetlib.iis.common.affiliation.AffiliationBuilder;
import eu.dnetlib.iis.ingest.pmc.metadata.schemas.ExtractedDocumentMetadata;
import eu.dnetlib.iis.ingest.pmc.metadata.schemas.Range;
import eu.dnetlib.iis.ingest.pmc.metadata.schemas.ReferenceBasicMetadata;
import eu.dnetlib.iis.ingest.pmc.metadata.schemas.ReferenceMetadata;
import eu.dnetlib.iis.metadataextraction.schemas.Affiliation;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Stack;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.jdom.Element;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import pl.edu.icm.cermine.metadata.affiliation.CRFAffiliationParser;

/* loaded from: input_file:eu/dnetlib/iis/ingest/pmc/metadata/PmcXmlHandler.class */
public class PmcXmlHandler extends DefaultHandler {
    // XML element names this handler reacts to. They are compared against the qualified
    // element name (third argument of startElement/endElement) and against the parents stack.
    private static final String ELEM_JOURNAL_TITLE = "journal-title";
    private static final String ELEM_JOURNAL_TITLE_GROUP = "journal-title-group";
    private static final String ELEM_ARTICLE_META = "article-meta";
    private static final String ELEM_ARTICLE_ID = "article-id";
    private static final String ELEM_AFFILIATION = "aff";
    // Text inside a label placed directly in aff is skipped by characters().
    private static final String ELEM_LABEL = "label";
    private static final String ELEM_REF_LIST = "ref-list";
    private static final String ELEM_REF = "ref";
    private static final String ELEM_PUB_ID = "pub-id";
    // Elements whose text is captured as fields of the current reference when found below ref/ref-list.
    private static final String ELEM_ARTICLE_TITLE = "article-title";
    private static final String ELEM_SOURCE = "source";
    private static final String ELEM_YEAR = "year";
    private static final String ELEM_VOLUME = "volume";
    private static final String ELEM_ISSUE = "issue";
    // fpage/lpage are handled both directly under article-meta (document pages) and below ref (reference pages).
    private static final String ELEM_FPAGE = "fpage";
    private static final String ELEM_LPAGE = "lpage";
    private static final String ELEM_NAME = "name";
    private static final String ELEM_SURNAME = "surname";
    private static final String ELEM_GIVEN_NAMES = "given-names";
    // The three element names accepted as the direct child of ref that carries the citation content.
    private static final String ELEM_CITATION = "citation";
    private static final String ELEM_ELEMENT_CITATION = "element-citation";
    private static final String ELEM_MIXED_CITATION = "mixed-citation";
    // Attribute holding the identifier type; read from article-id and pub-id elements.
    private static final String PUB_ID_TYPE = "pub-id-type";
    // Attribute read from the first element of the document to populate the entity type.
    private static final String ATTR_ARTICLE_TYPE = "article-type";
    // Identifier type for which an article-id value is stored as the document's pmid.
    private static final String PUB_ID_TYPE_PMID = "pmid";
    // Qualified names of the currently open elements, innermost on top.
    // Created in startDocument(), pushed in startElement(), popped in endElement(), released in endDocument().
    private Stack<String> parents;
    // Builder of the reference being parsed; created when a ref directly inside ref-list opens
    // and reset to null when that ref closes.
    private ReferenceMetadata.Builder currentRefMetaBuilder;
    // Author names collected for the current reference ("surname, given-names" entries).
    private List<CharSequence> currentRefAuthorList;
    // Character data accumulated by characters() while inside the current reference.
    private StringBuffer currentReferenceText;
    // Target metadata builder supplied by the caller; all extracted values are written into it.
    private final ExtractedDocumentMetadata.Builder builder;
    // Per-instance logger resolved from the runtime class.
    private final Logger log = Logger.getLogger(getClass());
    // Character data accumulated since the last element of interest was opened.
    private StringBuilder currentValue = new StringBuilder();
    // Values of the most recently closed surname / given-names elements placed directly inside name.
    private String currentSurname = null;
    private String currentGivenNames = null;
    // Set to true once non-whitespace text is seen directly inside a citation element of the current reference.
    private boolean currentReferenceTextExplicitlySet = false;
    // pub-id-type attribute of the pub-id element most recently opened inside a citation element.
    private String currentReferenceIdType = null;
    // pub-id-type attribute of the article-id element most recently opened inside article-meta.
    private String currentArticleIdType = null;
    // NOTE(review): not referenced anywhere else in this file as shown — appears unused; confirm before removing.
    boolean containsTextChild = false;
    // True until the first startElement() call of a document; that first element is treated as the root.
    boolean rootElement = true;

    /**
     * Creates a handler that writes everything it extracts into the given builder.
     *
     * @param builder metadata builder to be populated while the document is parsed;
     *                it is dereferenced without a null check by the SAX callbacks
     */
    public PmcXmlHandler(ExtractedDocumentMetadata.Builder builder) {
        this.builder = builder;
    }

    /**
     * Prepares the handler for a new document: resets the per-document state and
     * starts with an empty stack of open elements.
     *
     * @throws SAXException declared by the SAX contract; not thrown by this implementation
     */
    @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
    public void startDocument() throws SAXException {
        clearAllFields();
        this.parents = new Stack<>();
    }

    /**
     * Reacts to an opening tag. The very first element of a document is treated as the root
     * and only its article type is recorded. For every later element of interest the text
     * buffer is restarted, identifier type attributes are remembered, and a new reference
     * builder is prepared when a {@code ref} opens directly inside {@code ref-list}.
     * The element name is pushed onto the stack of open elements in every case.
     *
     * @param str        namespace URI (unused)
     * @param str2       local name (unused)
     * @param str3       qualified element name, used for all matching
     * @param attributes attributes of the element
     * @throws SAXException declared by the SAX contract; not thrown by this implementation
     */
    @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
    public void startElement(String str, String str2, String str3, Attributes attributes) throws SAXException {
        // The stack still ends with the parent here: the current element is pushed last.
        if (this.rootElement) {
            this.rootElement = false;
            final String articleType = attributes.getValue(ATTR_ARTICLE_TYPE);
            this.builder.setEntityType(articleType == null ? "unknown" : articleType);
        } else if (isWithinElement(str3, ELEM_JOURNAL_TITLE, ELEM_JOURNAL_TITLE_GROUP)) {
            this.currentValue = new StringBuilder();
        } else if (isWithinElement(str3, ELEM_ARTICLE_ID, ELEM_ARTICLE_META)) {
            this.currentArticleIdType = attributes.getValue(PUB_ID_TYPE);
            this.currentValue = new StringBuilder();
        } else if (opensPlainTextField(str3)) {
            this.currentValue = new StringBuilder();
        } else if (opensCitationPubId(str3)) {
            this.currentReferenceIdType = attributes.getValue(PUB_ID_TYPE);
            this.currentValue = new StringBuilder();
        } else if (isWithinElement(str3, ELEM_REF, ELEM_REF_LIST)) {
            final ReferenceBasicMetadata.Builder basicMetadata = ReferenceBasicMetadata.newBuilder();
            basicMetadata.setExternalIds(new HashMap<CharSequence, CharSequence>());
            this.currentRefMetaBuilder = ReferenceMetadata.newBuilder();
            this.currentRefMetaBuilder.setBasicMetadata(basicMetadata.build());
            this.currentRefAuthorList = new ArrayList<>();
            this.currentReferenceText = new StringBuffer();
        }
        this.parents.push(str3);
    }

    /**
     * Tells whether the opening element is one whose only start-time effect is restarting
     * the text buffer: document pages, affiliations, reference fields and name parts.
     */
    private boolean opensPlainTextField(String qName) {
        if (isWithinElement(qName, ELEM_FPAGE, ELEM_ARTICLE_META) || isWithinElement(qName, ELEM_LPAGE, ELEM_ARTICLE_META)) {
            return true;
        }
        if (hasAmongParents(qName, ELEM_AFFILIATION, this.parents, ELEM_ARTICLE_META)) {
            return true;
        }
        final String[] referenceFields = {
            ELEM_ARTICLE_TITLE, ELEM_SOURCE, ELEM_YEAR, ELEM_VOLUME, ELEM_ISSUE, ELEM_FPAGE, ELEM_LPAGE
        };
        for (String field : referenceFields) {
            if (hasAmongParents(qName, field, this.parents, ELEM_REF, ELEM_REF_LIST)) {
                return true;
            }
        }
        return isWithinElement(qName, ELEM_SURNAME, ELEM_NAME) || isWithinElement(qName, ELEM_GIVEN_NAMES, ELEM_NAME);
    }

    /** Tells whether the opening element is a pub-id placed directly inside one of the citation elements. */
    private boolean opensCitationPubId(String qName) {
        return isWithinElement(qName, ELEM_PUB_ID, ELEM_CITATION)
                || isWithinElement(qName, ELEM_PUB_ID, ELEM_ELEMENT_CITATION)
                || isWithinElement(qName, ELEM_PUB_ID, ELEM_MIXED_CITATION);
    }

    /**
     * Reacts to a closing tag by storing the text collected for the element in the proper
     * place: document-level fields (journal, pmid, pages, affiliations) go to the document
     * builder, reference-level fields go to the reference being built, and a closing
     * {@code ref} placed directly in {@code ref-list} finalizes that reference and appends it
     * to the document.
     *
     * @param str  namespace URI (unused)
     * @param str2 local name (unused)
     * @param str3 qualified element name, used for all matching
     * @throws SAXException     declared by the SAX contract
     * @throws RuntimeException wrapping any exception raised while handling the element,
     *                          with the document id in the message
     */
    @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
    public void endElement(String str, String str2, String str3) throws SAXException {
        try {
            // After this pop the top of the stack is the parent of the element being closed.
            this.parents.pop();
            if (isWithinElement(str3, ELEM_JOURNAL_TITLE, ELEM_JOURNAL_TITLE_GROUP)) {
                this.builder.setJournal(this.currentValue.toString().trim());
            } else if (isWithinElement(str3, ELEM_ARTICLE_ID, ELEM_ARTICLE_META) && PUB_ID_TYPE_PMID.equals(this.currentArticleIdType)) {
                this.builder.setPmid(this.currentValue.toString().trim());
            } else if (isWithinElement(str3, ELEM_FPAGE, ELEM_ARTICLE_META)) {
                if (this.builder.getPages() == null) {
                    this.builder.setPages(Range.newBuilder().build());
                }
                this.builder.getPages().setStart(this.currentValue.toString().trim());
            } else if (isWithinElement(str3, ELEM_LPAGE, ELEM_ARTICLE_META)) {
                if (this.builder.getPages() == null) {
                    this.builder.setPages(Range.newBuilder().build());
                }
                this.builder.getPages().setEnd(this.currentValue.toString().trim());
            } else if (hasAmongParents(str3, ELEM_AFFILIATION, this.parents, ELEM_ARTICLE_META)) {
                String affiliationText = this.currentValue.toString();
                if (affiliationText.trim().length() > 0) {
                    try {
                        // The parser is only created when there is something to parse.
                        Element parsed = new CRFAffiliationParser().parse(affiliationText);
                        if (parsed != null) {
                            if (this.builder.getAffiliations() == null) {
                                this.builder.setAffiliations(new ArrayList<Affiliation>());
                            }
                            Affiliation affiliation = AffiliationBuilder.build(parsed);
                            CharSequence rawText = affiliation.getRawText();
                            if (rawText == null || rawText.length() == 0) {
                                // Fall back to the text read from the document so the affiliation
                                // is kept instead of being silently discarded.
                                affiliation.setRawText(affiliationText);
                            }
                            this.builder.getAffiliations().add(affiliation);
                        }
                    } catch (IndexOutOfBoundsException e) {
                        // Best effort: a single unparsable affiliation must not fail the whole document.
                        this.log.error("exception occurred when parsing affiliation: " + affiliationText, e);
                    }
                }
            } else if (hasAmongParents(str3, ELEM_ARTICLE_TITLE, this.parents, ELEM_REF, ELEM_REF_LIST)) {
                this.currentRefMetaBuilder.getBasicMetadata().setTitle(this.currentValue.toString());
            } else if (hasAmongParents(str3, ELEM_SOURCE, this.parents, ELEM_REF, ELEM_REF_LIST)) {
                this.currentRefMetaBuilder.getBasicMetadata().setSource(this.currentValue.toString());
            } else if (hasAmongParents(str3, ELEM_YEAR, this.parents, ELEM_REF, ELEM_REF_LIST)) {
                this.currentRefMetaBuilder.getBasicMetadata().setYear(this.currentValue.toString());
            } else if (hasAmongParents(str3, ELEM_VOLUME, this.parents, ELEM_REF, ELEM_REF_LIST)) {
                this.currentRefMetaBuilder.getBasicMetadata().setVolume(this.currentValue.toString());
            } else if (hasAmongParents(str3, ELEM_ISSUE, this.parents, ELEM_REF, ELEM_REF_LIST)) {
                this.currentRefMetaBuilder.getBasicMetadata().setIssue(this.currentValue.toString());
            } else if (hasAmongParents(str3, ELEM_FPAGE, this.parents, ELEM_REF, ELEM_REF_LIST)) {
                if (this.currentRefMetaBuilder.getBasicMetadata().getPages() == null) {
                    this.currentRefMetaBuilder.getBasicMetadata().setPages(Range.newBuilder().build());
                }
                this.currentRefMetaBuilder.getBasicMetadata().getPages().setStart(this.currentValue.toString());
            } else if (hasAmongParents(str3, ELEM_LPAGE, this.parents, ELEM_REF, ELEM_REF_LIST)) {
                if (this.currentRefMetaBuilder.getBasicMetadata().getPages() == null) {
                    this.currentRefMetaBuilder.getBasicMetadata().setPages(Range.newBuilder().build());
                }
                this.currentRefMetaBuilder.getBasicMetadata().getPages().setEnd(this.currentValue.toString());
            } else if (hasAmongParents(str3, ELEM_PUB_ID, this.parents, ELEM_REF, ELEM_REF_LIST)) {
                if (this.currentReferenceIdType != null) {
                    this.currentRefMetaBuilder.getBasicMetadata().getExternalIds().put(this.currentReferenceIdType, this.currentValue.toString());
                }
            } else if (isWithinElement(str3, ELEM_SURNAME, ELEM_NAME)) {
                this.currentSurname = this.currentValue.toString();
            } else if (isWithinElement(str3, ELEM_GIVEN_NAMES, ELEM_NAME)) {
                this.currentGivenNames = this.currentValue.toString();
            } else if (hasAmongParents(str3, ELEM_NAME, this.parents, ELEM_REF)) {
                String authorName = joinAuthorName(this.currentSurname, this.currentGivenNames);
                if (authorName != null) {
                    this.currentRefAuthorList.add(authorName);
                }
                this.currentSurname = null;
                this.currentGivenNames = null;
            } else if (isWithinElement(str3, ELEM_CITATION, ELEM_REF) || isWithinElement(str3, ELEM_ELEMENT_CITATION, ELEM_REF) || isWithinElement(str3, ELEM_MIXED_CITATION, ELEM_REF)) {
                // Use the collected text as the reference text only when the citation element
                // itself carried non-whitespace text (flag set by characters()).
                if (!this.currentRefMetaBuilder.hasText() && this.currentReferenceTextExplicitlySet && this.currentReferenceText != null && this.currentReferenceText.length() > 0) {
                    String normalized = this.currentReferenceText.toString().trim().replaceAll(" +", " ");
                    if (!normalized.isEmpty()) {
                        this.currentRefMetaBuilder.setText(normalized);
                    }
                }
            } else if (isWithinElement(str3, ELEM_REF, ELEM_REF_LIST)) {
                if (this.builder.getReferences() == null) {
                    this.builder.setReferences(new ArrayList<ReferenceMetadata>());
                }
                // Positions are 1-based and follow the order in which references are closed.
                this.currentRefMetaBuilder.setPosition(Integer.valueOf(this.builder.getReferences().size() + 1));
                if (this.currentRefAuthorList != null && this.currentRefAuthorList.size() > 0) {
                    this.currentRefMetaBuilder.getBasicMetadata().setAuthors(this.currentRefAuthorList);
                }
                if (!this.currentRefMetaBuilder.hasText()) {
                    // No explicit citation text: synthesize it from the structured fields.
                    this.currentRefMetaBuilder.setText(generateReferenceRawText(this.currentRefMetaBuilder.getBasicMetadata()));
                }
                this.builder.getReferences().add(this.currentRefMetaBuilder.build());
                this.currentRefMetaBuilder = null;
                this.currentRefAuthorList = null;
                this.currentReferenceText = null;
                this.currentReferenceTextExplicitlySet = false;
                this.currentReferenceIdType = null;
            }
        } catch (Exception e2) {
            throw new RuntimeException("unexpected exception while processing doc: " + this.builder.getId(), e2);
        }
    }

    /**
     * Builds the "surname, given-names" form of an author name from whichever parts are
     * present, so that a missing part never shows up as the literal text "null".
     *
     * @return the combined name, the single available part, or {@code null} when both are missing
     */
    private static String joinAuthorName(String surname, String givenNames) {
        if (surname == null) {
            return givenNames;
        }
        if (givenNames == null) {
            return surname;
        }
        return surname + ", " + givenNames;
    }

    /**
     * Releases the stack of open elements once the document has been fully read.
     *
     * @throws SAXException declared by the SAX contract; not thrown by this implementation
     */
    @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
    public void endDocument() throws SAXException {
        final Stack<String> finished = this.parents;
        this.parents = null;
        finished.clear();
    }

    /**
     * Collects character data. Text of a {@code label} placed directly inside {@code aff} is
     * ignored; everything else is appended to the current value buffer and, when inside a
     * reference, to the reference text buffer as well.
     *
     * @param cArr buffer holding the characters
     * @param i    index of the first character to read
     * @param i2   number of characters to read
     * @throws SAXException declared by the SAX contract; not thrown by this implementation
     */
    @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
    public void characters(char[] cArr, int i, int i2) throws SAXException {
        // Temporarily take the current element off the stack so that the matching helpers
        // see its parent on top, exactly as they do in startElement/endElement.
        String pop = this.parents.pop();
        try {
            if (isWithinElement(pop, ELEM_LABEL, ELEM_AFFILIATION)) {
                return;
            }
            this.currentValue.append(cArr, i, i2);
            if (hasAmongParents(this.parents, ELEM_REF)) {
                if (isWithinElement(pop, ELEM_CITATION, ELEM_REF) || isWithinElement(pop, ELEM_ELEMENT_CITATION, ELEM_REF) || isWithinElement(pop, ELEM_MIXED_CITATION, ELEM_REF)) {
                    char[] chunk = new char[i2];
                    System.arraycopy(cArr, i, chunk, 0, i2);
                    if (containsNonWhiteCharacter(chunk)) {
                        this.currentReferenceTextExplicitlySet = true;
                    }
                }
                // Separate two adjacent chunks with a space when they would otherwise glue two
                // alphanumeric characters together. An empty chunk has no first character to inspect.
                if (i2 > 0 && this.currentReferenceText.length() > 0 && isAlphanumeric(cArr[i]) && isAlphanumeric(this.currentReferenceText.charAt(this.currentReferenceText.length() - 1))) {
                    this.currentReferenceText.append(' ');
                }
                this.currentReferenceText.append(cArr, i, i2);
            }
        } finally {
            // Single restore point for every exit path, so the element is pushed back exactly once.
            this.parents.push(pop);
        }
    }

    /**
     * Resets every piece of per-document parsing state to its initial value so that a new
     * document never inherits values left over from a previous or aborted parse. The target
     * builder and the stack of open elements are not touched here.
     */
    private void clearAllFields() {
        this.rootElement = true;
        this.containsTextChild = false;
        this.currentArticleIdType = null;
        this.currentValue = new StringBuilder();
        this.currentSurname = null;
        this.currentGivenNames = null;
        this.currentRefMetaBuilder = null;
        this.currentRefAuthorList = null;
        this.currentReferenceText = null;
        this.currentReferenceTextExplicitlySet = false;
        this.currentReferenceIdType = null;
    }

    /**
     * Tells whether the character is an ASCII digit or an ASCII letter.
     * Characters outside the ASCII range are never considered alphanumeric.
     *
     * @param c character to test
     * @return {@code true} for '0'-'9', 'A'-'Z' and 'a'-'z'
     */
    static boolean isAlphanumeric(char c) {
        if (c >= '0' && c <= '9') {
            return true;
        }
        if (c >= 'A' && c <= 'Z') {
            return true;
        }
        return c >= 'a' && c <= 'z';
    }

    /**
     * Tells whether an element has the expected name and, optionally, the expected direct parent.
     * The parent is taken from the top of the stack of open elements, so the element itself
     * must not be on the stack when this is called.
     *
     * @param str  name of the element being examined
     * @param str2 expected element name
     * @param str3 expected name of the direct parent, or {@code null} to accept any parent
     * @return {@code true} when the name matches and the parent condition holds
     */
    boolean isWithinElement(String str, String str2, String str3) {
        if (!str.equals(str2)) {
            return false;
        }
        if (str3 == null) {
            return true;
        }
        if (this.parents.isEmpty()) {
            return false;
        }
        return str3.equals(this.parents.peek());
    }

    /**
     * Tells whether an element has the expected name and the given chain of ancestors.
     *
     * @param str    name of the element being examined
     * @param str2   expected element name
     * @param stack  open ancestor elements, innermost on top
     * @param strArr ancestor names to look for, nearest ancestor first
     * @return {@code true} when the name matches and all ancestors are found in the given order
     */
    public static boolean hasAmongParents(String str, String str2, Stack<String> stack, String... strArr) {
        return str.equals(str2) && hasAmongParents(stack, strArr);
    }

    /**
     * Tells whether the given names occur among the open elements in the given order when
     * walking from the innermost element (top of the stack) towards the root. The names do
     * not have to be adjacent; other elements may sit between them.
     *
     * @param stack  open elements, innermost on top
     * @param strArr names to look for, nearest first
     * @return {@code true} when every name is found, each one further from the top than the previous
     */
    public static boolean hasAmongParents(Stack<String> stack, String... strArr) {
        final int depth = stack.size();
        if (strArr.length > depth) {
            return false;
        }
        // Index of the next stack entry to inspect; it only ever moves towards the root.
        int cursor = depth - 1;
        for (String expected : strArr) {
            while (cursor >= 0 && !expected.equals(stack.get(cursor))) {
                cursor--;
            }
            if (cursor < 0) {
                return false;
            }
            // Continue the search for the next name below the entry just matched.
            cursor--;
        }
        return true;
    }

    /**
     * Tells whether the array holds at least one character that is not whitespace
     * according to {@link Character#isWhitespace(char)}.
     *
     * @param cArr characters to inspect; may be {@code null} or empty
     * @return {@code true} when a non-whitespace character is present
     */
    static boolean containsNonWhiteCharacter(char[] cArr) {
        if (cArr != null) {
            for (int idx = 0; idx < cArr.length; idx++) {
                if (!Character.isWhitespace(cArr[idx])) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Builds a plain-text citation from the structured reference fields. Blank fields are
     * skipped; the remaining ones are laid out as
     * {@code authors. title. source. year; volume (issue): firstPage-lastPage}.
     *
     * @param referenceBasicMetadata structured reference fields
     * @return the generated text; empty when no field carries a non-blank value
     */
    public static String generateReferenceRawText(ReferenceBasicMetadata referenceBasicMetadata) {
        String authors = "";
        if (referenceBasicMetadata.getAuthors() != null) {
            authors = StringUtils.join(referenceBasicMetadata.getAuthors(), ", ");
        }
        String firstPage = null;
        String lastPage = null;
        if (referenceBasicMetadata.getPages() != null) {
            firstPage = textOrNull(referenceBasicMetadata.getPages().getStart());
            lastPage = textOrNull(referenceBasicMetadata.getPages().getEnd());
        }

        final StringBuilder rawText = new StringBuilder();
        appendIfNotBlank(rawText, "", authors, ". ");
        appendIfNotBlank(rawText, "", textOrNull(referenceBasicMetadata.getTitle()), ". ");
        appendIfNotBlank(rawText, "", textOrNull(referenceBasicMetadata.getSource()), ". ");
        appendIfNotBlank(rawText, "", textOrNull(referenceBasicMetadata.getYear()), "");
        appendIfNotBlank(rawText, "; ", textOrNull(referenceBasicMetadata.getVolume()), "");
        appendIfNotBlank(rawText, " (", textOrNull(referenceBasicMetadata.getIssue()), ")");
        appendIfNotBlank(rawText, ": ", firstPage, "");
        appendIfNotBlank(rawText, "-", lastPage, "");
        return rawText.toString();
    }

    /** Returns the string form of the value, or {@code null} when the value itself is {@code null}. */
    private static String textOrNull(Object value) {
        return value == null ? null : value.toString();
    }

    /** Appends {@code prefix + value + suffix} to the target, but only when the value is not blank. */
    private static void appendIfNotBlank(StringBuilder target, String prefix, String value, String suffix) {
        if (StringUtils.isNotBlank(value)) {
            target.append(prefix).append(value).append(suffix);
        }
    }
}
