package eu.dnetlib.data.mapreduce.hbase.dedup.experiment;

import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.model.Person;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;

/* loaded from: input_file:eu/dnetlib/data/mapreduce/hbase/dedup/experiment/JoinPersonGroupMapper.class */
public class JoinPersonGroupMapper extends Mapper<Text, Text, Text, Text> {
    public static final String PERSON = "person";
    private static final int MAX_TOKENS = 5;
    private static final int MIN_FEATURES = 10;
    private Text outKey;
    private Text outValue;
    private SubjectParser sp;

    protected void setup(Mapper<Text, Text, Text, Text>.Context context) throws IOException, InterruptedException {
        this.outKey = new Text();
        this.outValue = new Text();
        this.sp = new SubjectParser();
    }

    protected void map(Text text, Text text2, Mapper<Text, Text, Text, Text>.Context context) throws IOException, InterruptedException {
        try {
            Document read = new SAXReader().read(new StringReader(text2.toString()));
            SubjectsMap parse = this.sp.parse(read);
            CsvEntry csvEntry = new CsvEntry();
            Iterator<Subjects> it = parse.values().iterator();
            while (it.hasNext()) {
                Iterator<String> it2 = it.next().iterator();
                while (it2.hasNext()) {
                    csvEntry.addFeature(it2.next());
                }
            }
            List selectNodes = read.selectNodes("//*[local-name() = 'creator']");
            ArrayList<Person> newArrayList = Lists.newArrayList();
            for (int i = 0; i < selectNodes.size(); i++) {
                newArrayList.add(new Person(((Element) selectNodes.get(i)).getText(), false));
            }
            for (Person person : newArrayList) {
                context.getCounter(PERSON, "accurate " + person.isAccurate()).increment(1L);
                Set<String> outKeys = getOutKeys(person);
                context.getCounter(PERSON, String.format("accurate %s keys", Boolean.valueOf(person.isAccurate()))).increment(outKeys.size());
                for (String str : outKeys) {
                    CsvEntry csvEntry2 = new CsvEntry(str, csvEntry.getFeatures());
                    for (Person person2 : newArrayList) {
                        String normalize = normalize(person2.getSurnameString());
                        if (person.isAccurate() && person2.isAccurate() && !person.getSurnameString().equalsIgnoreCase(person2.getSurnameString())) {
                            csvEntry2.addFeature(normalize);
                        }
                    }
                    csvEntry2.getFeatures().remove(str);
                    if (str.length() <= 3) {
                        context.getCounter(PERSON, "key size <= 3").increment(1L);
                        return;
                    } else if (csvEntry2.getFeatures().size() < MIN_FEATURES) {
                        context.getCounter(PERSON, "features < 10").increment(1L);
                        return;
                    } else {
                        this.outKey.set(str);
                        this.outValue.set(csvEntry2.toString());
                        context.write(this.outKey, this.outValue);
                    }
                }
            }
        } catch (Throwable th) {
            System.out.println("GOT EX " + th);
            th.printStackTrace(System.err);
            context.getCounter(PERSON, th.getClass().toString()).increment(1L);
        }
    }

    private Set<String> getOutKeys(Person person) {
        HashSet newHashSet = Sets.newHashSet();
        if (person.isAccurate()) {
            newHashSet.add(normalize(person));
        } else {
            String normalize = normalize(person.getOriginal());
            for (String str : tokens(normalize)) {
                for (String str2 : tokens(normalize)) {
                    if (!str.equals(str2)) {
                        newHashSet.add(firstLC(str) + str2);
                    }
                }
            }
        }
        return newHashSet;
    }

    private String normalize(Person person) {
        return normalize(person.getSurnameString() + firstLC(person.getNameString()));
    }

    private String normalize(String str) {
        return str.replaceAll("[^a-zA-Z ]", "").toLowerCase().trim();
    }

    private Iterable<String> tokens(String str) {
        return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(str), MAX_TOKENS);
    }

    private String firstLC(String str) {
        return StringUtils.substring(str, 0, 1).toLowerCase();
    }

    protected /* bridge */ /* synthetic */ void map(Object obj, Object obj2, Mapper.Context context) throws IOException, InterruptedException {
        map((Text) obj, (Text) obj2, (Mapper<Text, Text, Text, Text>.Context) context);
    }
}
