This commit is contained in:
Miriam Baglioni 2020-11-16 10:53:12 +01:00
parent 0f1a4f6637
commit c29d142087
10 changed files with 696 additions and 88 deletions

View File

@ -47,6 +47,12 @@
<groupId>org.apache.commons</groupId> <groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId> <artifactId>commons-compress</artifactId>
</dependency> </dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-text -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-text</artifactId>
<version>1.9</version>
</dependency>
<dependency> <dependency>
<groupId>commons-io</groupId> <groupId>commons-io</groupId>
@ -123,6 +129,21 @@
<version>2.4.0.cloudera2</version> <version>2.4.0.cloudera2</version>
</dependency> </dependency>
<!-- https://mvnrepository.com/artifact/me.xdrop/fuzzywuzzy -->
<dependency>
<groupId>me.xdrop</groupId>
<artifactId>fuzzywuzzy</artifactId>
<version>1.3.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.intuit.fuzzymatcher/fuzzy-matcher -->
<dependency>
<groupId>com.intuit.fuzzymatcher</groupId>
<artifactId>fuzzy-matcher</artifactId>
<version>1.0.4</version>
</dependency>
</dependencies> </dependencies>

View File

@ -5,9 +5,12 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.io.Serializable; import java.io.Serializable;
import java.util.*; import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.CosineDistance;
import org.apache.commons.text.similarity.LevenshteinDistance;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
@ -23,13 +26,49 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.intuit.fuzzymatcher.component.MatchService;
import com.intuit.fuzzymatcher.domain.Document;
import com.intuit.fuzzymatcher.domain.Element;
import com.intuit.fuzzymatcher.domain.ElementType;
import com.intuit.fuzzymatcher.domain.Match;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.common.PacePerson; import eu.dnetlib.dhp.common.PacePerson;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import me.xdrop.fuzzywuzzy.FuzzySearch;
import scala.Tuple2; import scala.Tuple2;
/**
* It checks if the orcid provided by ORCID and the one found in the result have the same author information. The
* author information is handled before the checking. Handling steps:
words are lower-cased and trimmed, accents are replaced with their non-accented equivalent. Only alphabetical
* characters and white space are retained. All the other chars are substituted with space.
*
* The check is made on different specification levels:
*
* Level1: orcid author surname and result author surname are identical. We consider the match to be right
*
* Level2: we verify if orcid author surname contains result author surname or vice versa. If it is the case we consider
* the match to be right
*
* Level3: we verify if one of the two surnames is composed by two words. In that case we concatenate the words and do
* the checking again. If the two match, we consider the match to be checked
*
* Level4: name and surname can be inverted in one of the two entities. We consider the set of words composing the name
* and the surname that are longer than 2 for orcid and result. If all the words of the shorter list are contained in
* the longer one, we consider the match to be checked
*
* Level5: name and surname are inverted but one of the two is composed by two words. Mix of Level3 and level4. We consider
* the match to be checked
*
* Level6: surnames differ by a few chars. We apply the Levenshtein distance on surnames if their length is bigger than 3.
* If the distance is less than 2 we consider the match to be checked
*
* In all the other cases the match is considered wrong
*
*/
public class MakeReportSparkJob implements Serializable { public class MakeReportSparkJob implements Serializable {
private static final Logger log = LoggerFactory.getLogger(MakeReportSparkJob.class); private static final Logger log = LoggerFactory.getLogger(MakeReportSparkJob.class);
@ -105,13 +144,25 @@ public class MakeReportSparkJob implements Serializable {
private static void addInList(List<String> list, String to_add) { private static void addInList(List<String> list, String to_add) {
for (String word : to_add.split(" ")) { for (String word : to_add.split(" ")) {
if (word.length() >= 2) { if (word.length() > 2) {
list.add(word); list.add(word);
} }
} }
} }
/**
 * Normalises a name or surname for comparison: lower-cases it, removes dots,
 * strips a leading "dr " title, removes accents, and replaces every
 * non-alphabetical character with a space.
 *
 * @param input the raw name/surname (must not be null)
 * @return the normalised string; may be empty when no alphabetical chars remain
 */
public static String handleNameSurname(String input) {
    input = input.toLowerCase().replace(".", "");
    // strip only a "dr " title prefix: requiring the trailing space avoids
    // truncating real names such as "Drew", and avoids a
    // StringIndexOutOfBoundsException on inputs shorter than 3 characters
    // (the original tested startsWith("dr") and unconditionally cut 3 chars)
    if (input.startsWith("dr ")) {
        input = input.substring(3);
    }
    return StringUtils
        .stripAccents(input.trim())
        .replaceAll("[^a-z\\s]+", " ")
        .trim();
}
private static <I extends Result> void makeReport(SparkSession spark, String inputPath, Class<I> entityClazz, private static <I extends Result> void makeReport(SparkSession spark, String inputPath, Class<I> entityClazz,
String outputPath, String preparedInfoPath, String outputPath, String preparedInfoPath,
Dataset<OrcidAuthotitative> authoritative) { Dataset<OrcidAuthotitative> authoritative) {
@ -125,84 +176,7 @@ public class MakeReportSparkJob implements Serializable {
.equalTo(resultInfo.col("orcid")), .equalTo(resultInfo.col("orcid")),
"left") "left")
.map((MapFunction<Tuple2<ResultInfo, OrcidAuthotitative>, Tuple2<String, ReportInfo>>) pair -> { .map((MapFunction<Tuple2<ResultInfo, OrcidAuthotitative>, Tuple2<String, ReportInfo>>) pair -> {
Optional<OrcidAuthotitative> ooa = Optional.ofNullable(pair._2()); return getStringReportInfoFuzzyTuple2(pair);
if (!ooa.isPresent()) {
return null;
}
OrcidAuthotitative oa = ooa.get();
ResultInfo ri = pair._1();
if (StringUtils.isBlank(ri.getSurname())) {
PacePerson pp = new PacePerson(ri.getFullname(), false);
ri.setSurname(pp.getNormalisedSurname());
ri.setName(pp.getNormalisedFirstName());
}
ReportInfo reportInfo = new ReportInfo();
reportInfo.setOid(oa.getOid());
reportInfo.setOname(oa.getName());
reportInfo.setOsurname(oa.getSurname());
reportInfo.setOcreditname(oa.getCreditname());
reportInfo.setAssociatedAuthors(Arrays.asList(ri));
if (!Optional.ofNullable(oa.getSurname()).isPresent()) {
return new Tuple2<>("missing", reportInfo);
}
final String handledOsurname = StringUtils
.stripAccents(oa.getSurname().toLowerCase().trim())
.replace("-", " ")
.replace(".", "");
final String handledSurname = StringUtils
.stripAccents(ri.getSurname().toLowerCase().trim())
.replace("-", " ")
.replace(".", "");
if (!handledOsurname
.equalsIgnoreCase(handledSurname)) {
if (!handledOsurname.contains(handledSurname) && !handledSurname.contains(handledOsurname)) {
// check if the words composing the name and the surname are the same or one list contains the
// other.
// do for words of lenght bigger than two
String handledOname = "";
if (Optional.ofNullable(oa.getName()).isPresent()) {
handledOname = StringUtils
.stripAccents(oa.getName().toLowerCase().trim())
.replace("-", " ")
.replace(".", "");
}
String handledName = "";
if (Optional.ofNullable(ri.getName()).isPresent()) {
handledName = StringUtils
.stripAccents(ri.getName().toLowerCase().trim())
.replace("-", " ")
.replace(".", "");
}
final List<String> orcidList = new ArrayList<>();
final List<String> paperList = new ArrayList<>();
addInList(orcidList, handledOname);
addInList(orcidList, handledOsurname);
addInList(paperList, handledSurname);
addInList(paperList, handledName);
if (orcidList.size() <= paperList.size()) {
if (searchIn(paperList, orcidList)) {
return new Tuple2<>("check", reportInfo);
}
} else {
if (searchIn(orcidList, paperList)) {
return new Tuple2<>("check", reportInfo);
}
}
// todo add another level of checking (use levenstein)
return new Tuple2<>("wrong", reportInfo);
}
return new Tuple2<>("right", reportInfo);
}
return new Tuple2<>("right", reportInfo);
}, Encoders.tuple(Encoders.STRING(), Encoders.bean(ReportInfo.class))) }, Encoders.tuple(Encoders.STRING(), Encoders.bean(ReportInfo.class)))
.filter(Objects::nonNull); .filter(Objects::nonNull);
@ -222,6 +196,280 @@ public class MakeReportSparkJob implements Serializable {
} }
/**
 * Placeholder for a combined fuzzy-similarity score between the author string
 * coming from ORCID and the one found in the result. It currently always
 * returns 0 and appears unused within this file.
 *
 * Intended strategy (translated from the original Italian notes):
 * - pairs matching 1.0 with fuzzy-matcher => considered right
 * - pairs matching above 0.66 with fuzzy-matcher => considered right
 * - pairs not matched above, but with a FuzzyWuzzy ratio above 0.5 => "check"
 *   (probably right)
 * - pairs with a FuzzyWuzzy ratio between 0.5 and 0.3 => lowest-confidence band
 */
private static double fuzzyMatch(String orcid, String result) {
return 0;
}
/**
 * Fuzzy classification of a (result author, ORCID author) pair.
 *
 * @param pair result-side author info paired with the authoritative ORCID info
 *             (the ORCID side may be null when the join found no match)
 * @return null when no ORCID author is associated; otherwise a tuple whose key is
 *         "missing" when either surname is absent/empty after normalisation,
 *         "wrong"   when the two full names share no word (cosine similarity
 *                   rounds to 0), fuzzy-matcher finds no match, and the
 *                   FuzzyWuzzy ratio is below 30,
 *         "check"   in every other case
 */
public static Tuple2<String, ReportInfo> getStringReportInfoFuzzyTuple2(
    Tuple2<ResultInfo, OrcidAuthotitative> pair) {
    Optional<OrcidAuthotitative> ooa = Optional.ofNullable(pair._2());
    if (!ooa.isPresent()) {
        return null;
    }
    OrcidAuthotitative oa = ooa.get();
    ResultInfo ri = pair._1();
    // when the surname is missing, derive name/surname from the fullname
    if (StringUtils.isBlank(ri.getSurname())) {
        PacePerson pp = new PacePerson(ri.getFullname(), false);
        ri.setSurname(pp.getNormalisedSurname());
        ri.setName(pp.getNormalisedFirstName());
    }
    ReportInfo reportInfo = new ReportInfo();
    reportInfo.setOid(oa.getOid());
    reportInfo.setOname(oa.getName());
    reportInfo.setOsurname(oa.getSurname());
    reportInfo.setOcreditname(oa.getCreditName());
    reportInfo.setAssociatedAuthors(Arrays.asList(ri));
    if (!Optional.ofNullable(oa.getSurname()).isPresent()) {
        return new Tuple2<>("missing", reportInfo);
    }
    final String handledOsurname = handleNameSurname(oa.getSurname());
    if (handledOsurname.isEmpty()) {
        return new Tuple2<>("missing", reportInfo);
    }
    final String handledSurname = handleNameSurname(ri.getSurname());
    if (handledSurname.isEmpty()) {
        return new Tuple2<>("missing", reportInfo);
    }
    String handledOname = "";
    if (Optional.ofNullable(oa.getName()).isPresent()) {
        handledOname = handleNameSurname(oa.getName());
    }
    String handledName = "";
    if (Optional.ofNullable(ri.getName()).isPresent()) {
        handledName = handleNameSurname(ri.getName());
    }
    String[][] input = {
        {
            "1", handledOsurname + " " + handledOname
        },
        {
            "2", handledSurname + " " + handledName
        }
    };
    // no common word at all (cosine similarity rounds to 0): probably wrong,
    // but confirm with fuzzy-matcher and FuzzyWuzzy before giving up
    if (Math.round((1 - new CosineDistance().apply(input[0][1], input[1][1])) * 100) == 0) {
        MatchService matchService = new MatchService();
        List<Document> documentList = Arrays
            .stream(input)
            .map(contact -> new Document.Builder(contact[0])
                .addElement(
                    new Element.Builder<String>()
                        .setValue(contact[1])
                        .setType(ElementType.NAME)
                        .createElement())
                .createDocument())
            .collect(Collectors.toList());
        if (matchService.applyMatchByDocId(documentList).entrySet().size() == 0) {
            // below 30 the pair is most probably a wrong association
            // (threshold chosen from the wrongassociation.json sample)
            if (FuzzySearch.ratio(input[0][1], input[1][1]) < 30) {
                return new Tuple2<>("wrong", reportInfo);
            }
        }
    }
    return new Tuple2<>("check", reportInfo);
}
/**
 * Deterministic, multi-level check of a (result author, ORCID author) pair;
 * the levels are described in the class javadoc. Sets the reached level on the
 * ReportInfo before returning.
 *
 * @param pair result-side author info paired with the authoritative ORCID info
 * @return null when no ORCID author is associated; otherwise a tuple keyed
 *         "missing", "right", "check" or "wrong"
 */
public static Tuple2<String, ReportInfo> getStringReportInfoTuple2(Tuple2<ResultInfo, OrcidAuthotitative> pair) {
    Optional<OrcidAuthotitative> ooa = Optional.ofNullable(pair._2());
    if (!ooa.isPresent()) {
        return null;
    }
    OrcidAuthotitative oa = ooa.get();
    ResultInfo ri = pair._1();
    // when the surname is missing, derive name/surname from the fullname
    if (StringUtils.isBlank(ri.getSurname())) {
        PacePerson pp = new PacePerson(ri.getFullname(), false);
        ri.setSurname(pp.getNormalisedSurname());
        ri.setName(pp.getNormalisedFirstName());
    }
    ReportInfo reportInfo = new ReportInfo();
    reportInfo.setOid(oa.getOid());
    reportInfo.setOname(oa.getName());
    reportInfo.setOsurname(oa.getSurname());
    reportInfo.setOcreditname(oa.getCreditName());
    reportInfo.setAssociatedAuthors(Arrays.asList(ri));
    int level = 1;
    if (!Optional.ofNullable(oa.getSurname()).isPresent()) {
        return new Tuple2<>("missing", reportInfo);
    }
    final String handledOsurname = handleNameSurname(oa.getSurname());
    if (handledOsurname.isEmpty()) {
        return new Tuple2<>("missing", reportInfo);
    }
    final String handledSurname = handleNameSurname(ri.getSurname());
    if (handledSurname.isEmpty()) {
        return new Tuple2<>("missing", reportInfo);
    }
    // level 1: identical surnames
    if (handledOsurname.equals(handledSurname)) {
        reportInfo.setLevel("level" + level);
        return new Tuple2<>("right", reportInfo);
    }
    level++;
    // level 2: one surname contained in the other
    if (handledOsurname.contains(handledSurname) || handledSurname.contains(handledOsurname)) {
        reportInfo.setLevel("level" + level);
        return new Tuple2<>("right", reportInfo);
    }
    level++;
    // level 3: one of the two surnames is composed of two words; concatenate
    // them and compare with the other surname (Mohammadi Peyhani vs Mohammadipeyhani)
    String[] handledorcidSplit = handledOsurname.split(" ");
    String[] handledresultSplit = handledSurname.split(" ");
    if (handledorcidSplit.length == 2) {
        String tmpSurname = handledorcidSplit[0] + handledorcidSplit[1];
        if (tmpSurname.equals(handledSurname)) {
            reportInfo.setLevel("level" + level);
            return new Tuple2<>("check", reportInfo);
        }
    }
    if (handledresultSplit.length == 2) {
        String tmpSurname = handledresultSplit[0] + handledresultSplit[1];
        // BUG FIX: compare the concatenated result surname against the ORCID
        // surname; the original compared it against handledSurname itself,
        // which can never match once a space is present
        if (tmpSurname.equals(handledOsurname)) {
            reportInfo.setLevel("level" + level);
            return new Tuple2<>("check", reportInfo);
        }
    }
    level++;
    // level 4: name and surname may be inverted. Compare the sets of words
    // (longer than two chars, see addInList) composing name + surname on both sides
    String handledOname = "";
    if (Optional.ofNullable(oa.getName()).isPresent()) {
        handledOname = handleNameSurname(oa.getName());
    }
    String handledName = "";
    if (Optional.ofNullable(ri.getName()).isPresent()) {
        handledName = handleNameSurname(ri.getName());
    }
    final List<String> orcidList = new ArrayList<>();
    final List<String> paperList = new ArrayList<>();
    addInList(orcidList, handledOname);
    addInList(orcidList, handledOsurname);
    addInList(paperList, handledSurname);
    addInList(paperList, handledName);
    if (checkListContainment(reportInfo, level, orcidList, paperList))
        return new Tuple2<>("check", reportInfo);
    level++;
    // level 5: inverted name/surname where one side is composed of two words;
    // try both concatenation orders.
    // NOTE(review): the result side is split on the *name* here (the ORCID side
    // on the surname) — this looks intentional for the inverted case, but confirm
    handledorcidSplit = handledOsurname.split(" ");
    handledresultSplit = handledName.split(" ");
    if (handledorcidSplit.length == 2) {
        orcidList.clear();
        orcidList.add(handledorcidSplit[0] + handledorcidSplit[1]);
        addInList(orcidList, handledOname);
        if (checkListContainment(reportInfo, level, orcidList, paperList)) {
            return new Tuple2<>("check", reportInfo);
        }
        orcidList.clear();
        orcidList.add(handledorcidSplit[1] + handledorcidSplit[0]);
        addInList(orcidList, handledOname);
        if (checkListContainment(reportInfo, level, orcidList, paperList)) {
            return new Tuple2<>("check", reportInfo);
        }
    }
    if (handledresultSplit.length == 2) {
        orcidList.clear();
        addInList(orcidList, handledOname);
        addInList(orcidList, handledOsurname);
        paperList.clear();
        paperList.add(handledresultSplit[0] + handledresultSplit[1]);
        addInList(paperList, handledSurname);
        if (checkListContainment(reportInfo, level, orcidList, paperList))
            return new Tuple2<>("check", reportInfo);
        paperList.clear();
        paperList.add(handledresultSplit[1] + handledresultSplit[0]);
        addInList(paperList, handledSurname);
        if (checkListContainment(reportInfo, level, orcidList, paperList))
            return new Tuple2<>("check", reportInfo);
    }
    level++;
    // level 6: surnames differing by a few chars — Levenshtein distance <= 2
    // on strings longer than 3 chars (a single instance suffices; it is stateless)
    final LevenshteinDistance levenshtein = new LevenshteinDistance();
    if (handledOsurname.length() > 3 && handledSurname.length() > 3) {
        if (levenshtein.apply(handledOsurname, handledSurname) <= 2) {
            reportInfo.setLevel("level" + level);
            return new Tuple2<>("check", reportInfo);
        }
    }
    if (handledOsurname.length() > 3 && handledName.length() > 3) {
        if (levenshtein.apply(handledOsurname, handledName) <= 2) {
            reportInfo.setLevel("level" + level);
            return new Tuple2<>("check", reportInfo);
        }
    }
    return new Tuple2<>("wrong", reportInfo);
}
/**
 * Verifies that every word of the shorter list appears in the longer one
 * (via searchIn); on success records the reached level on the report.
 *
 * @return true when the shorter list is fully contained in the longer one
 */
private static boolean checkListContainment(ReportInfo reportInfo, int level, List<String> orcidList,
    List<String> paperList) {
    final boolean orcidIsShorter = orcidList.size() <= paperList.size();
    final List<String> shorter = orcidIsShorter ? orcidList : paperList;
    final List<String> longer = orcidIsShorter ? paperList : orcidList;
    if (searchIn(longer, shorter)) {
        reportInfo.setLevel("level" + level);
        return true;
    }
    return false;
}
/** /**
* searches in list1 all the words of list 2 * searches in list1 all the words of list 2
* @param list1 the list where to search for the words * @param list1 the list where to search for the words

View File

@ -2,19 +2,47 @@
package eu.dnetlib.dhp.oa.graph.clean.authorpids; package eu.dnetlib.dhp.oa.graph.clean.authorpids;
import java.io.Serializable; import java.io.Serializable;
import java.util.List;
public class OrcidAuthotitative implements Serializable { public class OrcidAuthotitative implements Serializable {
private String oid; private String oid;
private String name; private String name;
private String surname; private String surname;
private String creditname; private String creditName;
private String otherName;
private List<String> otherNames;
private String errorCode;
public String getCreditname() { public String getOtherName() {
return creditname; return otherName;
} }
public void setCreditname(String creditname) { public void setOtherName(String otherName) {
this.creditname = creditname; this.otherName = otherName;
}
public List<String> getOtherNames() {
return otherNames;
}
public void setOtherNames(List<String> otherNames) {
this.otherNames = otherNames;
}
public String getErrorCode() {
return errorCode;
}
public void setErrorCode(String errorCode) {
this.errorCode = errorCode;
}
public String getCreditName() {
return creditName;
}
public void setCreditName(String creditName) {
this.creditName = creditName;
} }
public String getOid() { public String getOid() {

View File

@ -73,7 +73,7 @@ public class PrepareResultsSparkJob implements Serializable {
result.createOrReplaceTempView("result"); result.createOrReplaceTempView("result");
String query = "select auth.name name, auth.surname surname, auth.fullname fullname, pIde.value orcid, id, cf.value collectedfrom" String query = "select auth.name name, auth.surname surname, auth.fullname fullname, pIde.value orcid, id, cf.value collectedfrom "
+ +
"from result " + "from result " +
"lateral view explode(author) a as auth " + "lateral view explode(author) a as auth " +

View File

@ -12,6 +12,16 @@ public class ReportInfo implements Serializable {
private List<ResultInfo> associatedAuthors; private List<ResultInfo> associatedAuthors;
private String level;
public String getLevel() {
return level;
}
public void setLevel(String level) {
this.level = level;
}
public String getOid() { public String getOid() {
return oid; return oid;
} }

View File

@ -404,7 +404,7 @@
--conf spark.sql.shuffle.partitions=7680 --conf spark.sql.shuffle.partitions=7680
</spark-opts> </spark-opts>
<arg>--preparedInfoPath</arg><arg>${workingDir}/dataset</arg> <arg>--preparedInfoPath</arg><arg>${workingDir}/dataset</arg>
<arg>--outputPath</arg><arg>${utputPath}/dataset</arg> <arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg> <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--inputPath</arg><arg>${inputPath}/dataset</arg> <arg>--inputPath</arg><arg>${inputPath}/dataset</arg>
<arg>--orcidInputPath</arg><arg>${orcidInputPath}</arg> <arg>--orcidInputPath</arg><arg>${orcidInputPath}</arg>

View File

@ -182,7 +182,7 @@
<decision name="cleanorreport"> <decision name="cleanorreport">
<switch> <switch>
<case to="make_report">${wf:conf('clean') eq false}</case> <case to="make_report">${wf:conf('clean') eq false}</case>
<case to="clean_orcid_copy">${wf:conf('clean') eq true}</case> <case to="clean_orcid">${wf:conf('clean') eq true}</case>
<default to="make_report"/> <default to="make_report"/>
</switch> </switch>
</decision> </decision>

View File

@ -1,11 +1,19 @@
package eu.dnetlib.dhp.oa.graph.clean; package eu.dnetlib.dhp.oa.graph.clean;
import java.io.IOException; import java.io.*;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.CosineDistance;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.hadoop.yarn.webapp.hamlet.Hamlet; import org.apache.hadoop.yarn.webapp.hamlet.Hamlet;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@ -21,8 +29,19 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;
import com.intuit.fuzzymatcher.component.MatchService;
import com.intuit.fuzzymatcher.domain.Document;
import com.intuit.fuzzymatcher.domain.Element;
import com.intuit.fuzzymatcher.domain.ElementType;
import com.intuit.fuzzymatcher.domain.Match;
import com.wcohen.ss.Levenstein;
import eu.dnetlib.dhp.oa.graph.clean.authorpids.*; import eu.dnetlib.dhp.oa.graph.clean.authorpids.*;
import jdk.nashorn.internal.ir.annotations.Ignore;
import me.xdrop.fuzzywuzzy.FuzzySearch;
import net.sf.saxon.trans.Maker;
import scala.Tuple2;
public class CleanOrcidTest { public class CleanOrcidTest {
@ -34,6 +53,106 @@ public class CleanOrcidTest {
private static final Logger log = LoggerFactory.getLogger(CleanOrcidTest.class); private static final Logger log = LoggerFactory.getLogger(CleanOrcidTest.class);
// needed for fuzzywuzzy to get a lower bound ratio under which the authors are most probably different
// {id, normalised author string} rows; consecutive rows form a pair of authors
// known NOT to be the same person — used to calibrate the FuzzyWuzzy lower bound
String[][] wrong = {
{
"1", MakeReportSparkJob.handleNameSurname("Alex Bullock")
},
{
"2", MakeReportSparkJob.handleNameSurname("Gillian Farnie")
},
{
"3", MakeReportSparkJob.handleNameSurname("Luís Rocha")
},
{
"4", MakeReportSparkJob.handleNameSurname("Pedro Relvas")
},
{
"9", MakeReportSparkJob.handleNameSurname("Prasanth Manohar")
},
{
"10", MakeReportSparkJob.handleNameSurname("Nachimuthu Ramesh")
}
};
// {id, normalised author string} rows; consecutive rows form a pair to be
// compared by the distance/fuzzy tests below. Ids "9"-"12" are reused for
// distinct pairs — presumably only contact[1] matters to the tests; verify
String[][] input = {
{
"1", MakeReportSparkJob.handleNameSurname("Dr. Ulrike Elsdoerfer Ph.D.")
},
{
"2", MakeReportSparkJob.handleNameSurname("Ulrike Elsdörfer")
},
{
"3", MakeReportSparkJob.handleNameSurname("Steven Ossont")
},
{
"4", MakeReportSparkJob.handleNameSurname("Steven J. Johnston")
},
{
"5", MakeReportSparkJob.handleNameSurname("Joanna Molyn")
},
{
"6", MakeReportSparkJob.handleNameSurname("Joanna Molyn-Blanchfield")
},
{
"7", MakeReportSparkJob.handleNameSurname("Zhang Tian-Tuo")
},
{
"8", MakeReportSparkJob.handleNameSurname("Zhang Tiantuo")
},
{
"9", MakeReportSparkJob.handleNameSurname("Prasanth Manohar")
},
{
"10", MakeReportSparkJob.handleNameSurname("Nachimuthu Ramesh")
},
{
"9", MakeReportSparkJob.handleNameSurname("Hassan Ahmed")
},
{
"10", MakeReportSparkJob.handleNameSurname("Hassan Mohamed")
},
{
"11", MakeReportSparkJob.handleNameSurname("Jonathan ODonnell")
},
{
"12", MakeReportSparkJob.handleNameSurname("Jonathon A. O Dannell")
},
{
"11", MakeReportSparkJob.handleNameSurname("Amilcar António Teiga Teixeira")
},
{
"12", MakeReportSparkJob.handleNameSurname("Amílcar Teixeira")
},
{
"13", MakeReportSparkJob.handleNameSurname("Bruno Rossion")
},
{
"14", MakeReportSparkJob.handleNameSurname("B. Rossion")
},
{
"15", MakeReportSparkJob.handleNameSurname("TINGYOU WANG")
},
{
"16", MakeReportSparkJob.handleNameSurname("Wang Ting-You")
},
{
"17", MakeReportSparkJob.handleNameSurname("Jacob Moran-Gilad")
},
{
"18", MakeReportSparkJob.handleNameSurname("Moran-Gilad Jacon")
},
{
"19", MakeReportSparkJob.handleNameSurname("Adelle Semmler")
},
{
"20", MakeReportSparkJob.handleNameSurname("Adelle Craig")
}
};
@BeforeAll @BeforeAll
public static void beforeAll() throws IOException { public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(CleanOrcidTest.class.getSimpleName()); workingDir = Files.createTempDirectory(CleanOrcidTest.class.getSimpleName());
@ -168,4 +287,151 @@ public class CleanOrcidTest {
.map(item -> OBJECT_MAPPER.readValue(item, ResultInfo.class)); .map(item -> OBJECT_MAPPER.readValue(item, ResultInfo.class));
} }
@Test
public void cleanNameSurname() {
    // normalise two sample strings and print their Levenstein score
    String cleanedName = StringUtils
        .stripAccents("Hübner".toLowerCase().trim())
        .replaceAll("[^a-z\\s]+", " ");
    String cleanedSurname = StringUtils
        .stripAccents("Hubenr".toLowerCase().trim())
        .replace(".", "")
        .replaceAll("[^a-z\\s]+", " ")
        .replace("'", " ")
        .trim();
    final Levenstein distance = new Levenstein();
    System.out.println(Math.abs(distance.score(cleanedName, cleanedSurname)));
}
@Test
public void testMakeReport() {
    // author info as found in the result
    ResultInfo resultInfo = new ResultInfo();
    resultInfo.setName("Prasanth");
    resultInfo.setSurname("Manohar");
    // authoritative info provided by ORCID
    OrcidAuthotitative orcidInfo = new OrcidAuthotitative();
    orcidInfo.setName("Nachimuthu");
    orcidInfo.setSurname("Ramesh");
    Tuple2<String, ReportInfo> report = MakeReportSparkJob
        .getStringReportInfoFuzzyTuple2(new Tuple2<ResultInfo, OrcidAuthotitative>(resultInfo, orcidInfo));
    System.out.println(new Gson().toJson(report._2(), ReportInfo.class));
}
@Test
public void cosineDistanceTest() {
    // consecutive rows of the input table form the pair under comparison
    for (int idx = 0; idx < input.length; idx += 2) {
        final String first = input[idx][1];
        final String second = input[idx + 1][1];
        final double cosineDistance = new CosineDistance().apply(first, second);
        System.out
            .println(
                "CosineDistance of '" + first + "' & '" + second + "' | Words in strings are "
                    + Math.round(cosineDistance * 100) + "% dis-similar or "
                    + Math.round((1 - cosineDistance) * 100) + "% similar.");
    }
}
@Test
public void testAuthorFuzzyMatch() {
    // same normalisation used by the report job, plugged in as pre-processing
    Function<String, String> clean = MakeReportSparkJob::handleNameSurname;
    List<Document> documents = Arrays
        .stream(input)
        .map(row -> new Document.Builder(row[0])
            .addElement(
                new Element.Builder<String>()
                    .setValue(row[1])
                    .setType(ElementType.NAME)
                    .setPreProcessingFunction(clean)
                    .createElement())
            .createDocument())
        .collect(Collectors.toList());
    Map<String, List<Match<Document>>> matches = new MatchService().applyMatchByDocId(documents);
    matches.entrySet().forEach(entry -> {
        entry.getValue().forEach(match -> {
            System.out
                .println(
                    "Data: " + match.getData() + " Matched With: " + match.getMatchedWith() + " Score: "
                        + match.getScore().getResult());
        });
    });
}
// prints the FuzzyWuzzy similarity ratio for each consecutive pair of rows
// in the sample input table above
@Test
public void FuzzyWuzzyTest() {
applyFuzzyWuzzy(input);
}
/**
 * Prints the FuzzyWuzzy similarity ratio for pairs of author names known to be
 * wrongly associated, read one JSON object per line from the
 * wrongassociation.json test resource.
 */
@Test
public void FuzzyWuzzyWrongTest() throws IOException {
    final String inputPath = getClass()
        .getResource("/eu/dnetlib/dhp/oa/graph/clean/wrongassociation.json")
        .getPath();
    List<OrcidAuthor> orcidAuthorList = new ArrayList<>();
    // try-with-resources: the original never closed the reader (resource leak)
    try (BufferedReader reader = new BufferedReader(new FileReader(inputPath))) {
        String line;
        while (null != (line = reader.readLine())) {
            orcidAuthorList.add(new Gson().fromJson(line, OrcidAuthor.class));
        }
    }
    applyFuzzyWuzzy(orcidAuthorList);
}
// prints the FuzzyWuzzy similarity ratio of every orcid/result pair in the list
private void applyFuzzyWuzzy(List<OrcidAuthor> orcidAuthorList) {
    for (OrcidAuthor entry : orcidAuthorList) {
        final String orcid = MakeReportSparkJob.handleNameSurname(entry.getOrcid());
        final String result = MakeReportSparkJob.handleNameSurname(entry.getResult());
        System.out
            .println(
                "FuzzyWuzzy of '" + orcid + "' & '" + result + "' | Similarity ratio "
                    + FuzzySearch.ratio(orcid, result));
    }
}
// prints the FuzzyWuzzy similarity ratio of each consecutive pair of rows
private void applyFuzzyWuzzy(String[][] pairs) {
    for (int idx = 0; idx < pairs.length; idx += 2) {
        final String first = pairs[idx][1];
        final String second = pairs[idx + 1][1];
        System.out
            .println(
                "FuzzyWuzzy of '" + first + "' & '" + second + "' | Similarity ratio "
                    + FuzzySearch.ratio(first, second));
    }
}
// lightweight bean mirroring one JSON line of the wrongassociation.json
// resource: the author string provided by ORCID and the one found in the result
class OrcidAuthor implements Serializable {
// author name as provided by ORCID
private String orcid;
// author name as found in the result
private String result;
public String getOrcid() {
return orcid;
}
public void setOrcid(String orcid) {
this.orcid = orcid;
}
public String getResult() {
return result;
}
public void setResult(String result) {
this.result = result;
}
}
} }

View File

@ -0,0 +1,35 @@
{"orcid":"Alex Bullock" ,"result": "Gillian Farnie"}
{"orcid": "Luís Rocha", "result":"Pedro Relvas"}
{"orcid": "Prasanth Manohar", "result": "Nachimuthu Ramesh"}
{"orcid": "Zhiying Lin", "result":"Guanglong Huang"}
{"orcid":"Andrew Golnar","result":"Kim Pepin"}
{"orcid": "Gilles Marcou", "result":"Filippo Lunghini"}
{"orcid": "Philip Hahn", "result":"John Maron"}
{"orcid": "Kirsty Gibson", "result":"Kim R. Hardie"}
{"orcid": "Paula Lago", "result":"Shingo Takeda"}
{"orcid": "Paul Seidler", "result":"Dalziel J. Wilson"}
{"orcid": "Solomon Okunade", "result":"Rufus Adebayo Ajisafe"}
{"orcid": "Emi Arai", "result":"Masaru Hasegawa"}
{"orcid": "Dr Muhammad Yameen Sandhu", "result":"Nutapong Somjit"}
{"orcid": "Xianlei Cai", "result":"Weiming Yu"}
{"orcid": "Bing He", "result":"Chuan Xing"}
{"orcid": "JULIEN COURCHET", "result":"Franck Polleux"}
{"orcid": "Xiaoyun Pan", "result":"Liru Chen"}
{"orcid": "Marianne Okal", "result":"Brendan Hodge"}
{"orcid": "Michal Fereczkowski", "result":"Silje Grini Nielsen"}
{"orcid": "Nobuyuki Nakai", "result":"Tadafumi Kurogi"}
{"orcid": "Colin Daniel", "result":"Christine Cuyler"}
{"orcid": "Xavier Arnan", "result":"Anna Torné-Noguera"}
{"orcid": "Denita Hadziabdic", "result":"Meher Ony"}
{"orcid": "Kor de Jong", "result":"K. Koning"}
{"orcid": "Chaya Patel", "result":"David Leib"}
{"orcid": "Fagner Carniel", "result":"Adonai Lacruz"}
{"orcid": "Carrie Peltz", "result":"Erica Kornblith"}
{"orcid": "Kathryn Huyvaert", "result":"Larissa L. Bailey"}
{"orcid": "Christine Provost", "result":"Nathalie Sennéchael"}
{"orcid": "Nancy Pachana", "result":"Lisa DiNatale"}
{"orcid": "ARDESHIR BAYAT", "result":"P. Marcos Gorresen"}
{"orcid": "Paul Berkowitz", "result":"Silje Grini Nielsen"}
{"orcid": "Alice Laciny", "result":"Brian Metscher"}
{"orcid": "Octavio Rojas", "result":"Josie A. Griffin"}
{"orcid": "Carlo Sandroni", "result":"Riccardo Scattolini"}