dnet-hadoop/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/authorpids/MakeReportSparkJob.java


package eu.dnetlib.dhp.oa.graph.clean.authorpids;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.CosineDistance;
import org.apache.commons.text.similarity.LevenshteinDistance;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;
import com.intuit.fuzzymatcher.component.MatchService;
import com.intuit.fuzzymatcher.domain.Document;
import com.intuit.fuzzymatcher.domain.Element;
import com.intuit.fuzzymatcher.domain.ElementType;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.common.PacePerson;
import eu.dnetlib.dhp.oa.graph.clean.authorpids.constraints.ConstraintResolver;
import eu.dnetlib.dhp.oa.graph.clean.authorpids.constraints.ConstraintResolverFactory;
import eu.dnetlib.dhp.oa.graph.clean.authorpids.constraints.Selection;
import eu.dnetlib.dhp.schema.oaf.Result;
import me.xdrop.fuzzywuzzy.FuzzySearch;
import scala.Tuple2;
/**
 * It checks whether the author information associated with an ORCID id in the ORCID dump and the author information
 * found in the result for the same ORCID id refer to the same person. The author information is normalised before
 * the check. Normalisation steps: words are lower-cased and trimmed, and accented characters are replaced with their
 * non-accented equivalents. Only alphabetical characters and white space are retained; every other character is
 * replaced with a space.
 *
 * The check is made at different levels:
 *
 * Level 1: the ORCID author surname and the result author surname are identical. We consider the match to be right.
 *
 * Level 2: we verify whether the ORCID author surname contains the result author surname or vice versa. If so, we
 * consider the match to be right.
 *
 * Level 3: we verify whether one of the two surnames is composed of two words. In that case we concatenate the words
 * and repeat the check. If the two match, we consider the match to be checked.
 *
 * Level 4: name and surname can be inverted in one of the two entities. We consider the sets of words longer than two
 * characters composing the name and the surname of the ORCID author and of the result author. If all the words of the
 * shorter list are contained in the longer one, we consider the match to be checked.
 *
 * Level 5: name and surname are inverted, but one of the two is composed of two words. A mix of Level 3 and Level 4.
 * We consider the match to be checked.
 *
 * Level 6: the surnames differ by a few characters. We apply the Levenshtein distance to the surnames if their length
 * is greater than 3. If the distance is not greater than 2, we consider the match to be checked.
 *
 * In all other cases the match is considered wrong.
 *
 */
public class MakeReportSparkJob implements Serializable {
private static final Logger log = LoggerFactory.getLogger(MakeReportSparkJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
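	/**
	 * Reads the parameters defined in /eu/dnetlib/dhp/oa/graph/clean_orcid/input_report_author_pids.json and
	 * triggers the report creation. Expected arguments: isSparkSessionManaged (optional, defaults to true),
	 * outputPath, preparedInfoPath, orcidInputPath and whitelist (the JSON serialisation of a WhiteList of
	 * constraints).
	 */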
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
MakeReportSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/clean_orcid/input_report_author_pids.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
String preparedInfoPath = parser.get("preparedInfoPath");
log.info("preparedInfoPath: {} ", preparedInfoPath);
String orcidInputPath = parser.get("orcidInputPath");
log.info("orcidInputPath: {}", orcidInputPath);
String whiteListString = parser.get("whitelist");
log.info("whitelist: {}", whiteListString);
SparkConf conf = new SparkConf();
runWithSparkHiveSession(
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
makeReport(
spark, outputPath, preparedInfoPath,
loadAuthoritativeOrcid(spark, orcidInputPath), whiteListString);
});
}
	/**
	 * Loads the sequence file containing the ORCID id, name and surname of the authors.
	 * It returns a dataset whose model maps the one defined in the OrcidAuthotitative class.
	 * @param spark the spark session
	 * @param orcidInputPath the path where to read the sequence file
	 * @return the dataset
	 */
private static Dataset<OrcidAuthotitative> loadAuthoritativeOrcid(SparkSession spark, String orcidInputPath) {
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
return spark
.createDataset(
JavaRDD
.toRDD(
sc
.sequenceFile(orcidInputPath, Text.class, Text.class)
.map(pair -> OBJECT_MAPPER.readValue(pair._2().toString(), OrcidAuthotitative.class))),
Encoders.bean(OrcidAuthotitative.class));
}
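	/**
	 * Splits the given string on white space and adds to the list only the tokens longer than two characters.
	 * @param list the list to be extended
	 * @param to_add the white-space separated string whose tokens should be added
	 */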
private static void addInList(List<String> list, String to_add) {
for (String word : to_add.split(" ")) {
if (word.length() > 2) {
list.add(word);
}
}
}
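	/**
	 * Normalises a name or surname: lower-cases it, removes dots, strips a leading "dr" title, replaces accented
	 * characters with their plain equivalents and keeps only alphabetical characters and white space.
	 * For example, "Dr. José-María O'Neill" becomes "jose maria o neill".
	 * @param input the name or surname to normalise
	 * @return the normalised string, possibly empty
	 */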
	public static String handleNameSurname(String input) {
		input = input.toLowerCase().replace(".", "");
		// strip a leading academic title ("dr. " becomes "dr " after the dot removal above). The trailing space
		// is required so that surnames such as "drago" are not truncated
		if (input.startsWith("dr ")) {
			input = input.substring(3);
		} else if (input.equals("dr")) {
			return "";
		}
		return StringUtils
			.stripAccents(input.trim())
			.replaceAll("[^a-z\\s]+", " ")
			.trim();
	}
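	/**
	 * Builds the report. It parses the whitelist, resolves the selection criteria of each constraint, joins the
	 * prepared result information with the authoritative ORCID records on the ORCID id, classifies every pair and
	 * writes the "wrong", "right", "check", "missing" and "constraint" sets under the given output path.
	 * @param spark the spark session
	 * @param outputPath the path where to write the report
	 * @param preparedInfoPath the path where to read the association between ORCID ids and result authors
	 * @param authoritative the dataset of authoritative ORCID records
	 * @param whiteliststring the JSON serialisation of the whitelist of constraints
	 */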
private static <I extends Result> void makeReport(SparkSession spark,
String outputPath, String preparedInfoPath,
Dataset<OrcidAuthotitative> authoritative, String whiteliststring) {
WhiteList whitelist = new Gson().fromJson(whiteliststring, WhiteList.class);
log.info("whitelist_size: {}", whitelist.getWhitelist().size());
ConstraintResolver resolver = ConstraintResolverFactory.newInstance();
		whitelist.getWhitelist().forEach(constraint -> {
			try {
				constraint.setSelection(resolver);
			} catch (NoSuchMethodException | InstantiationException | IllegalAccessException
				| InvocationTargetException e) {
				log.error("unable to set the selection for one of the whitelist constraints", e);
			}
		});
Dataset<ResultInfo> resultInfo = readPath(spark, preparedInfoPath, ResultInfo.class);
Dataset<Tuple2<String, ReportInfo>> checkedResult = resultInfo
.joinWith(
authoritative, authoritative
.col("oid")
.equalTo(resultInfo.col("orcid")),
"left")
.map((MapFunction<Tuple2<ResultInfo, OrcidAuthotitative>, Tuple2<String, ReportInfo>>) pair -> {
return getStringReportInfoFuzzyTuple2(pair, whitelist);
}, Encoders.tuple(Encoders.STRING(), Encoders.bean(ReportInfo.class)))
.filter(Objects::nonNull);
writeSet(
checkedResult.filter((FilterFunction<Tuple2<String, ReportInfo>>) result -> result._1().equals("wrong")),
outputPath + "/wrong");
writeSet(
checkedResult.filter((FilterFunction<Tuple2<String, ReportInfo>>) result -> result._1().equals("right")),
outputPath + "/right");
writeSet(
checkedResult.filter((FilterFunction<Tuple2<String, ReportInfo>>) result -> result._1().equals("check")),
outputPath + "/check");
writeSet(
checkedResult.filter((FilterFunction<Tuple2<String, ReportInfo>>) result -> result._1().equals("missing")),
outputPath + "/missing");
writeSet(
checkedResult
.filter((FilterFunction<Tuple2<String, ReportInfo>>) result -> result._1().equals("constraint")),
outputPath + "/constraint");
}
	private static double fuzzyMatch(String orcid, String result) {
		// placeholder: apply one or more fuzzy functions to determine whether the input strings match.
		// Intended policy:
		// - pairs matching 1.0 with the fuzzy matcher are right
		// - pairs matching above 0.66 with the fuzzy matcher are put among the right ones
		// - pairs not matched above, but with a FuzzyWuzzy score above 0.5, are put in check (probably right)
		// - pairs with a FuzzyWuzzy score between 0.5 and 0.3
		return 0;
	}
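	/**
	 * Classifies a (result author, ORCID author) pair using fuzzy string matching. The normalised "surname name"
	 * strings are compared with cosine distance first: a similarity of 1 is considered right, while a similarity
	 * of 0 triggers further checks against the ORCID credit name and other names, the fuzzymatcher library and
	 * the FuzzyWuzzy ratio. Pairs satisfying one of the whitelist constraints are reported under "constraint",
	 * pairs where either surname is missing or empty after normalisation under "missing".
	 * @param pair the joined result and ORCID author information
	 * @param whiteList the set of constraints (in OR) that force a pair into the "constraint" report
	 * @return a tuple whose first element is one of "right", "wrong", "check", "missing" or "constraint"
	 */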
public static Tuple2<String, ReportInfo> getStringReportInfoFuzzyTuple2(
Tuple2<ResultInfo, OrcidAuthotitative> pair, WhiteList whiteList) {
Optional<OrcidAuthotitative> ooa = Optional.ofNullable(pair._2());
if (!ooa.isPresent()) {
return null;
}
OrcidAuthotitative oa = ooa.get();
ResultInfo ri = pair._1();
if (StringUtils.isBlank(ri.getSurname())) {
PacePerson pp = new PacePerson(ri.getFullname(), false);
ri.setSurname(pp.getNormalisedSurname());
ri.setName(pp.getNormalisedFirstName());
}
ReportInfo reportInfo = new ReportInfo();
reportInfo.setOid(oa.getOid());
reportInfo.setOname(oa.getName());
reportInfo.setOsurname(oa.getSurname());
reportInfo.setOcreditname(oa.getCreditName());
reportInfo.setAlternativeNames(oa.getOtherNames());
reportInfo.setAssociatedAuthors(Arrays.asList(ri));
if (!Optional.ofNullable(oa.getSurname()).isPresent()) {
return new Tuple2<>("missing", reportInfo);
}
	final String handledOsurname = handleNameSurname(oa.getSurname());
	if (handledOsurname.isEmpty()) {
		return new Tuple2<>("missing", reportInfo);
	}
	final String handledSurname = handleNameSurname(ri.getSurname());
	if (handledSurname.isEmpty()) {
		return new Tuple2<>("missing", reportInfo);
	}
final String handledOname = Optional
.ofNullable(oa.getName())
.map(name -> handleNameSurname(name))
.orElse("");
final String handledName = Optional
.ofNullable(ri.getName())
.map(name -> handleNameSurname(name))
.orElse("");
if (verifyConstraints(new HashMap<String, String>() {
{
put("oname", handledOname);
put("osurname", handledOsurname);
put("name", handledName);
put("surname", handledSurname);
}
}, whiteList)) {
return new Tuple2<>("constraint", reportInfo);
}
String[][] input = {
{
"1", handledOsurname + " " + handledOname
},
{
"2", handledSurname + " " + handledName
}
};
// exact match word by word
Double cosineDistance = new CosineDistance().apply(input[0][1], input[1][1]);
if (Math.round((1 - cosineDistance) * 100) == 100) {
reportInfo.setLevel("cosine similarity equals to 1");
return new Tuple2<>("right", reportInfo);
}
		// check if there is no word in common. If there is none, the pair could be wrong
		if (Math.round((1 - cosineDistance) * 100) == 0) {
// verify if there is another name that can be used by the author
if (StringUtils.isNotEmpty(oa.getCreditName())) {
try {
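					// Math.round of a value in [0, 1] is 1 only when the cosine similarity with the credit
					// name is at least 0.5, i.e. when the two strings share a non-negligible number of words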
if (Math
.round(
1 - new CosineDistance().apply(input[1][1], handleNameSurname(oa.getCreditName()))) > 0) {
reportInfo.setLevel("not zero cosine on credit names");
return new Tuple2<>("check", reportInfo);
}
} catch (Exception e) {
reportInfo.setLevel(e.getMessage() + " " + oa.getCreditName());
return new Tuple2<>("check", reportInfo);
}
}
			if (oa.getOtherNames() != null && !oa.getOtherNames().isEmpty()) {
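				// repeat the same cosine check against each of the alternative names provided by ORCID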
for (String othername : oa.getOtherNames()) {
if (StringUtils.isNotEmpty(othername))
try {
if (Math
.round(1 - new CosineDistance().apply(input[1][1], handleNameSurname(othername))) > 0) {
reportInfo.setLevel("not zero cosine on othen names");
return new Tuple2<>("check", reportInfo);
}
} catch (Exception e) {
reportInfo.setLevel(e.getMessage() + " " + othername);
return new Tuple2<>("check", reportInfo);
}
}
}
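			// build one fuzzymatcher Document per name and run the match service: an empty result map
			// means that the fuzzymatcher library found no match between the two names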
MatchService matchService = new MatchService();
List<Document> documentList = Arrays.asList(input).stream().map(contact -> {
return new Document.Builder(contact[0])
.addElement(
new Element.Builder<String>()
.setValue(contact[1])
.setType(ElementType.NAME)
.createElement())
.createDocument();
}).collect(Collectors.toList());
if (matchService.applyMatchByDocId(documentList).entrySet().size() == 0) {
double out = FuzzySearch.ratio(input[0][1], input[1][1]);
if (out < 29) {
reportInfo.setLevel("less than 29 in fuzzywuzzy");
return new Tuple2<>("wrong", reportInfo);
} else {
// TODO extend the checking to catch range of fuzzy wuzzy that could be wrong
// try using soundex techniques or merge with previous implementation or both
reportInfo.setLevel("more than 29 in fuzzywuzzy");
return new Tuple2<>("check", reportInfo);
}
}
			// TODO match size is not equal to zero. Verify the match value and then decide how to proceed
} else {
// MatchService matchService = new MatchService();
//
// List<Document> documentList = Arrays.asList(input).stream().map(contact -> {
// return new Document.Builder(contact[0])
// .addElement(
// new Element.Builder<String>()
// .setValue(contact[1])
// .setType(ElementType.NAME)
// .createElement())
// .createDocument();
// }).collect(Collectors.toList());
// if (matchService.applyMatchByDocId(documentList).entrySet().size() == 1) {
if (FuzzySearch.ratio(input[0][1], input[1][1]) > 90) {
reportInfo.setLevel("more than 90 in fuzzywuzzy");
return new Tuple2<>("right", reportInfo);
} else {
reportInfo.setLevel("less than 90 in fuzzywuzzy");
return new Tuple2<>("check", reportInfo);
}
// }else{
// reportInfo.setLevel("not found a match in name matching");
// return new Tuple2<>("check", reportInfo);
// }
}
// // they have some words in common. check if orcid provides creditName or otherNames to check for distance
// //
// List<Document> documentList = Arrays.asList(input).stream().map(contact -> {
// return new Document.Builder(contact[0])
// .addElement(
// new Element.Builder<String>()
// .setValue(contact[1])
// .setType(ElementType.NAME)
// .createElement())
// .createDocument();
// }).collect(Collectors.toList());
//
// MatchService matchService = new MatchService();
//
// Map<String, List<Match<Document>>> result = matchService.applyMatchByDocId(documentList);
//
// if (result.entrySet().size() > 0) {
// reportInfo.setLevel("fuzzyMatch");
// return new Tuple2<>("right", reportInfo);
// }
return new Tuple2<>("right", reportInfo);
}
	/**
	 * Verifies the whitelist constraints against the given parameters. The constraints are in OR: the method
	 * returns true as soon as one of them is satisfied.
	 */
private static boolean verifyConstraints(Map<String, String> param, WhiteList whitelist) {
log.info("whitelist_size : {}", whitelist.getWhitelist().size());
for (SelectionConstraints constraint : whitelist.getWhitelist()) {
if (constraint.verifyCriteria(param)) {
return true;
}
}
return false;
}
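	/**
	 * Classifies a (result author, ORCID author) pair applying, in order, the levels described in the class
	 * Javadoc: equality of the normalised surnames, containment of one surname in the other, concatenation of
	 * two-word surnames, containment between the word sets of name plus surname, the same check with inverted
	 * name and surname, and finally the Levenshtein distance.
	 * @param pair the joined result and ORCID author information
	 * @return a tuple whose first element is one of "right", "wrong", "check" or "missing"
	 */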
public static Tuple2<String, ReportInfo> getStringReportInfoTuple2(Tuple2<ResultInfo, OrcidAuthotitative> pair) {
Optional<OrcidAuthotitative> ooa = Optional.ofNullable(pair._2());
if (!ooa.isPresent()) {
return null;
}
OrcidAuthotitative oa = ooa.get();
ResultInfo ri = pair._1();
if (StringUtils.isBlank(ri.getSurname())) {
PacePerson pp = new PacePerson(ri.getFullname(), false);
ri.setSurname(pp.getNormalisedSurname());
ri.setName(pp.getNormalisedFirstName());
}
ReportInfo reportInfo = new ReportInfo();
reportInfo.setOid(oa.getOid());
reportInfo.setOname(oa.getName());
reportInfo.setOsurname(oa.getSurname());
reportInfo.setOcreditname(oa.getCreditName());
reportInfo.setAssociatedAuthors(Arrays.asList(ri));
int level = 1;
if (!Optional.ofNullable(oa.getSurname()).isPresent()) {
return new Tuple2<>("missing", reportInfo);
}
	final String handledOsurname = handleNameSurname(oa.getSurname());
	if (handledOsurname.isEmpty()) {
		return new Tuple2<>("missing", reportInfo);
	}
	final String handledSurname = handleNameSurname(ri.getSurname());
	if (handledSurname.isEmpty()) {
		return new Tuple2<>("missing", reportInfo);
	}
		// check if oSurname and surname are equal
if (handledOsurname.equals(handledSurname)) {
reportInfo.setLevel("level" + level);
return new Tuple2<>("right", reportInfo);
}
level++;
// check if one is contained in the other
if (handledOsurname.contains(handledSurname) || handledSurname.contains(handledOsurname)) {
reportInfo.setLevel("level" + level);
return new Tuple2<>("right", reportInfo);
}
level++;
// check if one of the two is composed of more than one word. In this case concatenate the two words
// and check again (Mohammadi Peyhani vs Mohammadipeyhani)
String[] handledorcidSplit = handledOsurname.split(" ");
String[] handledresultSplit = handledSurname.split(" ");
if (handledorcidSplit.length == 2) {
String tmpSurname = handledorcidSplit[0] + handledorcidSplit[1];
if (tmpSurname.equals(handledSurname)) {
reportInfo.setLevel("level" + level);
return new Tuple2<>("check", reportInfo);
}
}
		if (handledresultSplit.length == 2) {
			String tmpSurname = handledresultSplit[0] + handledresultSplit[1];
			// compare the concatenated result surname against the ORCID surname (symmetric to the check above)
			if (tmpSurname.equals(handledOsurname)) {
				reportInfo.setLevel("level" + level);
				return new Tuple2<>("check", reportInfo);
			}
		}
level++;
		// check if the words composing the name and the surname are the same or if one list contains the other.
		// done only for words longer than two characters
String handledOname = "";
if (Optional.ofNullable(oa.getName()).isPresent()) {
handledOname = handleNameSurname(oa.getName());
}
String handledName = "";
if (Optional.ofNullable(ri.getName()).isPresent()) {
handledName = handleNameSurname(ri.getName());
}
final List<String> orcidList = new ArrayList<>();
final List<String> paperList = new ArrayList<>();
addInList(orcidList, handledOname);
addInList(orcidList, handledOsurname);
addInList(paperList, handledSurname);
addInList(paperList, handledName);
if (checkListContainment(reportInfo, level, orcidList, paperList))
return new Tuple2<>("check", reportInfo);
level++;
handledorcidSplit = handledOsurname.split(" ");
handledresultSplit = handledName.split(" ");
if (handledorcidSplit.length == 2) {
orcidList.clear();
orcidList.add(handledorcidSplit[0] + handledorcidSplit[1]);
addInList(orcidList, handledOname);
if (checkListContainment(reportInfo, level, orcidList, paperList)) {
return new Tuple2<>("check", reportInfo);
}
orcidList.clear();
orcidList.add(handledorcidSplit[1] + handledorcidSplit[0]);
addInList(orcidList, handledOname);
if (checkListContainment(reportInfo, level, orcidList, paperList)) {
return new Tuple2<>("check", reportInfo);
}
}
if (handledresultSplit.length == 2) {
orcidList.clear();
addInList(orcidList, handledOname);
addInList(orcidList, handledOsurname);
paperList.clear();
paperList.add(handledresultSplit[0] + handledresultSplit[1]);
addInList(paperList, handledSurname);
if (checkListContainment(reportInfo, level, orcidList, paperList))
return new Tuple2<>("check", reportInfo);
paperList.clear();
paperList.add(handledresultSplit[1] + handledresultSplit[0]);
addInList(paperList, handledSurname);
if (checkListContainment(reportInfo, level, orcidList, paperList))
return new Tuple2<>("check", reportInfo);
}
level++;
if (handledOsurname.length() > 3 && handledSurname.length() > 3) {
LevenshteinDistance l = new LevenshteinDistance();
if (l.apply(handledOsurname, handledSurname) <= 2) {
reportInfo.setLevel("level" + level);
return new Tuple2<>("check", reportInfo);
}
}
if (handledOsurname.length() > 3 && handledName.length() > 3) {
LevenshteinDistance l = new LevenshteinDistance();
if (l.apply(handledOsurname, handledName) <= 2) {
reportInfo.setLevel("level" + level);
return new Tuple2<>("check", reportInfo);
}
}
return new Tuple2<>("wrong", reportInfo);
}
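	/**
	 * Verifies that the smaller of the two word lists is entirely contained in the larger one. If so, it sets
	 * the level on the report and returns true.
	 */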
private static boolean checkListContainment(ReportInfo reportInfo, int level, List<String> orcidList,
List<String> paperList) {
if (orcidList.size() <= paperList.size()) {
if (searchIn(paperList, orcidList)) {
reportInfo.setLevel("level" + level);
return true;
}
} else {
if (searchIn(orcidList, paperList)) {
reportInfo.setLevel("level" + level);
return true;
}
}
return false;
}
/**
* searches in list1 all the words of list 2
* @param list1 the list where to search for the words
* @param list2 the list containing the words to be searched
* @return true if all the words in list 2 are contained in list1
*/
private static boolean searchIn(List<String> list1, List<String> list2) {
for (String word : list2) {
if (!list1.contains(word)) {
return false;
}
}
return true;
}
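	/**
	 * Groups the classified tuples by ORCID id, merges the associated authors of every group into a single
	 * ReportInfo and writes the result as gzip-compressed JSON to the given output path.
	 */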
private static void writeSet(Dataset<Tuple2<String, ReportInfo>> dataset, String outputPath) {
dataset
.groupByKey(
(MapFunction<Tuple2<String, ReportInfo>, String>) value -> value._2().getOid(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Tuple2<String, ReportInfo>, ReportInfo>) (oid, tuple2Iterator) -> {
ReportInfo reportInfo = tuple2Iterator.next()._2();
List<ResultInfo> aList = reportInfo.getAssociatedAuthors();
tuple2Iterator.forEachRemaining(tuple -> aList.addAll(tuple._2().getAssociatedAuthors()));
reportInfo.setAssociatedAuthors(aList);
return reportInfo;
}, Encoders.bean(ReportInfo.class))
.repartition(1)
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(outputPath);
}
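	/**
	 * Reads a newline-delimited JSON file into a typed dataset.
	 */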
private static <R> Dataset<R> readPath(
SparkSession spark, String inputPath, Class<R> clazz) {
return spark
.read()
.textFile(inputPath)
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
}
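	/**
	 * Removes the given path from HDFS, if it exists.
	 */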
private static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
}