forked from D-Net/dnet-hadoop
This commit is contained in:
parent 0f1a4f6637
commit c29d142087
@@ -47,6 +47,12 @@
			<groupId>org.apache.commons</groupId>
			<artifactId>commons-compress</artifactId>
		</dependency>
		<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-text -->
		<dependency>
			<groupId>org.apache.commons</groupId>
			<artifactId>commons-text</artifactId>
			<version>1.9</version>
		</dependency>

		<dependency>
			<groupId>commons-io</groupId>
@@ -123,6 +129,21 @@
			<version>2.4.0.cloudera2</version>
		</dependency>

		<!-- https://mvnrepository.com/artifact/me.xdrop/fuzzywuzzy -->
		<dependency>
			<groupId>me.xdrop</groupId>
			<artifactId>fuzzywuzzy</artifactId>
			<version>1.3.1</version>
		</dependency>

		<!-- https://mvnrepository.com/artifact/com.intuit.fuzzymatcher/fuzzy-matcher -->
		<dependency>
			<groupId>com.intuit.fuzzymatcher</groupId>
			<artifactId>fuzzy-matcher</artifactId>
			<version>1.0.4</version>
		</dependency>

	</dependencies>
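The hunk above adds the two fuzzy-matching libraries exercised later in this commit. For orientation, a minimal sketch of their call patterns as they appear below (the inputs are illustrative only):

	// me.xdrop.fuzzywuzzy: token-based similarity ratio in the 0-100 range
	int ratio = FuzzySearch.ratio("prasanth manohar", "nachimuthu ramesh");

	// com.intuit.fuzzymatcher: documents are built with Document.Builder/Element.Builder
	// and then matched in bulk with MatchService.applyMatchByDocId(documentList)
	MatchService matchService = new MatchService();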
@@ -5,9 +5,12 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;

import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.CosineDistance;
import org.apache.commons.text.similarity.LevenshteinDistance;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
@@ -23,13 +26,49 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.intuit.fuzzymatcher.component.MatchService;
import com.intuit.fuzzymatcher.domain.Document;
import com.intuit.fuzzymatcher.domain.Element;
import com.intuit.fuzzymatcher.domain.ElementType;
import com.intuit.fuzzymatcher.domain.Match;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.common.PacePerson;
import eu.dnetlib.dhp.schema.oaf.Result;
import me.xdrop.fuzzywuzzy.FuzzySearch;
import scala.Tuple2;

/**
 * It checks whether the author information provided by ORCID and the author information found in the result match.
 * The author information is normalised before the check. Normalisation steps: words are lower-cased and trimmed,
 * and accents are replaced with their non-accented equivalents. Only alphabetical characters and white space are
 * retained; every other character is replaced with a space.
 *
 * The check is made at different levels:
 *
 * Level 1: the ORCID author surname and the result author surname are identical. We consider the match to be right.
 *
 * Level 2: we verify if the ORCID author surname contains the result author surname, or vice versa. If so, we
 * consider the match to be right.
 *
 * Level 3: we verify if one of the two surnames is composed of two words. In that case we concatenate the words and
 * do the check again. If the two match, we consider the match to be checked.
 *
 * Level 4: name and surname can be inverted in one of the two entities. We consider the set of words longer than 2
 * composing the name and the surname, for both ORCID and the result. If all the words of the shorter list are
 * contained in the longer one, we consider the match to be checked.
 *
 * Level 5: name and surname are inverted, but one of the two is composed of two words. A mix of Level 3 and Level 4.
 * We consider the match to be checked.
 *
 * Level 6: the surnames differ by a few characters. We apply the Levenshtein distance to the surnames if their
 * length is bigger than 3. If the distance is less than 2, we consider the match to be checked.
 *
 * In all the other cases the match is considered wrong.
 */

public class MakeReportSparkJob implements Serializable {
	private static final Logger log = LoggerFactory.getLogger(MakeReportSparkJob.class);
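To make the level semantics concrete, a usage sketch of the matching entry point added later in this diff (my illustration, not part of the commit; it assumes the ResultInfo and OrcidAuthotitative setters shown elsewhere in the diff):

	ResultInfo ri = new ResultInfo();
	ri.setName("ting you");
	ri.setSurname("wang");

	OrcidAuthotitative oa = new OrcidAuthotitative();
	oa.setName("wang");
	oa.setSurname("ting you");

	// name and surname are inverted between the two records: every word longer
	// than two characters of one word list is contained in the other
	Tuple2<String, ReportInfo> verdict = MakeReportSparkJob.getStringReportInfoTuple2(new Tuple2<>(ri, oa));
	// expected: verdict._1() is "check" and verdict._2().getLevel() is "level4"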
@@ -105,13 +144,25 @@ public class MakeReportSparkJob implements Serializable {

	private static void addInList(List<String> list, String to_add) {
		for (String word : to_add.split(" ")) {
-			if (word.length() >= 2) {
+			if (word.length() > 2) {
				list.add(word);
			}
		}
	}

	public static String handleNameSurname(String input) {
		input = input.toLowerCase().replace(".", "");
		if (input.startsWith("dr")) {
			input = input.substring(3);
		}

		return StringUtils
			.stripAccents(input.trim())
			.replaceAll("[^a-z\\s]+", " ")
			.trim();
	}

	private static <I extends Result> void makeReport(SparkSession spark, String inputPath, Class<I> entityClazz,
		String outputPath, String preparedInfoPath,
		Dataset<OrcidAuthotitative> authoritative) {
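For reference, a quick sketch (mine, not part of the commit) of what handleNameSurname produces, traced from the implementation above:

	handleNameSurname("Dr. Ulrike Elsdörfer Ph.D."); // -> "ulrike elsdorfer phd"
	handleNameSurname("Zhang Tian-Tuo");             // -> "zhang tian tuo"
	handleNameSurname("O'Donnell");                  // -> "o donnell"
	// note: the "dr" prefix check also clips names that genuinely start with "dr",
	// e.g. "Drake" -> "ke"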
@@ -125,84 +176,7 @@
					.equalTo(resultInfo.col("orcid")),
				"left")
			.map((MapFunction<Tuple2<ResultInfo, OrcidAuthotitative>, Tuple2<String, ReportInfo>>) pair -> {
-				Optional<OrcidAuthotitative> ooa = Optional.ofNullable(pair._2());
-				if (!ooa.isPresent()) {
-					return null;
-				}
-				OrcidAuthotitative oa = ooa.get();
-
-				ResultInfo ri = pair._1();
-
-				if (StringUtils.isBlank(ri.getSurname())) {
-					PacePerson pp = new PacePerson(ri.getFullname(), false);
-					ri.setSurname(pp.getNormalisedSurname());
-					ri.setName(pp.getNormalisedFirstName());
-				}
-				ReportInfo reportInfo = new ReportInfo();
-				reportInfo.setOid(oa.getOid());
-				reportInfo.setOname(oa.getName());
-				reportInfo.setOsurname(oa.getSurname());
-				reportInfo.setOcreditname(oa.getCreditname());
-				reportInfo.setAssociatedAuthors(Arrays.asList(ri));
-
-				if (!Optional.ofNullable(oa.getSurname()).isPresent()) {
-					return new Tuple2<>("missing", reportInfo);
-				}
-				final String handledOsurname = StringUtils
-					.stripAccents(oa.getSurname().toLowerCase().trim())
-					.replace("-", " ")
-					.replace(".", "");
-				final String handledSurname = StringUtils
-					.stripAccents(ri.getSurname().toLowerCase().trim())
-					.replace("-", " ")
-					.replace(".", "");
-				if (!handledOsurname
-					.equalsIgnoreCase(handledSurname)) {
-					if (!handledOsurname.contains(handledSurname) && !handledSurname.contains(handledOsurname)) {
-						// check if the words composing the name and the surname are the same or one list contains the
-						// other.
-						// do for words of lenght bigger than two
-						String handledOname = "";
-						if (Optional.ofNullable(oa.getName()).isPresent()) {
-							handledOname = StringUtils
-								.stripAccents(oa.getName().toLowerCase().trim())
-								.replace("-", " ")
-								.replace(".", "");
-						}
-						String handledName = "";
-						if (Optional.ofNullable(ri.getName()).isPresent()) {
-							handledName = StringUtils
-								.stripAccents(ri.getName().toLowerCase().trim())
-								.replace("-", " ")
-								.replace(".", "");
-						}
-
-						final List<String> orcidList = new ArrayList<>();
-						final List<String> paperList = new ArrayList<>();
-
-						addInList(orcidList, handledOname);
-						addInList(orcidList, handledOsurname);
-
-						addInList(paperList, handledSurname);
-						addInList(paperList, handledName);
-
-						if (orcidList.size() <= paperList.size()) {
-							if (searchIn(paperList, orcidList)) {
-								return new Tuple2<>("check", reportInfo);
-							}
-						} else {
-							if (searchIn(orcidList, paperList)) {
-								return new Tuple2<>("check", reportInfo);
-							}
-						}
-
-						// todo add another level of checking (use levenstein)
-						return new Tuple2<>("wrong", reportInfo);
-					}
-					return new Tuple2<>("right", reportInfo);
-				}
-
-				return new Tuple2<>("right", reportInfo);
+				return getStringReportInfoFuzzyTuple2(pair);

			}, Encoders.tuple(Encoders.STRING(), Encoders.bean(ReportInfo.class)))
			.filter(Objects::nonNull);
@@ -222,6 +196,280 @@ public class MakeReportSparkJob implements Serializable {

	}

	private static double fuzzyMatch(String orcid, String result) {
		// apply one or more fuzzy functions to determine if the input strings match
		// pairs that score 1.0 with fuzzy-matcher => right
		// pairs that score above 0.66 with fuzzy-matcher are also put among the right ones
		// pairs not matched above, but with a fuzzywuzzy ratio above 0.5, are put in check
		// (probably right)
		// pairs with a fuzzywuzzy ratio between 0.5 and 0.3 ...
		return 0;

	}
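The stub above only lists its intended thresholds. A hypothetical sketch (mine, not part of the commit) of how those cut-offs could be wired together, assuming fuzzy-matcher scores in [0, 1] and FuzzySearch.ratio values in [0, 100]:

	private static String classifyByFuzzyScores(double fuzzyMatcherScore, int fuzzyWuzzyRatio) {
		if (fuzzyMatcherScore >= 0.66) {
			return "right"; // 1.0, and anything above 0.66, counted as right
		}
		if (fuzzyWuzzyRatio > 50) {
			return "check"; // probably right
		}
		// the stub leaves open what to do with ratios between 50 and 30
		return "wrong";
	}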
	public static Tuple2<String, ReportInfo> getStringReportInfoFuzzyTuple2(
		Tuple2<ResultInfo, OrcidAuthotitative> pair) {
		Optional<OrcidAuthotitative> ooa = Optional.ofNullable(pair._2());
		if (!ooa.isPresent()) {
			return null;
		}
		OrcidAuthotitative oa = ooa.get();

		ResultInfo ri = pair._1();

		if (StringUtils.isBlank(ri.getSurname())) {
			PacePerson pp = new PacePerson(ri.getFullname(), false);
			ri.setSurname(pp.getNormalisedSurname());
			ri.setName(pp.getNormalisedFirstName());
		}
		ReportInfo reportInfo = new ReportInfo();
		reportInfo.setOid(oa.getOid());
		reportInfo.setOname(oa.getName());
		reportInfo.setOsurname(oa.getSurname());
		reportInfo.setOcreditname(oa.getCreditName());
		reportInfo.setAssociatedAuthors(Arrays.asList(ri));

		int level = 1;

		if (!Optional.ofNullable(oa.getSurname()).isPresent()) {
			return new Tuple2<>("missing", reportInfo);
		}
		final String handledOsurname = handleNameSurname(oa.getSurname());

		if (handledOsurname.equalsIgnoreCase("")) {
			return new Tuple2<>("missing", reportInfo);
		}
		final String handledSurname = handleNameSurname(ri.getSurname());

		if (handledSurname.equals("")) {
			return new Tuple2<>("missing", reportInfo);
		}

		String handledOname = "";
		if (Optional.ofNullable(oa.getName()).isPresent()) {
			handledOname = handleNameSurname(oa.getName());
		}
		String handledName = "";
		if (Optional.ofNullable(ri.getName()).isPresent()) {
			handledName = handleNameSurname(ri.getName());
		}

		String[][] input = {
			{
				"1", handledOsurname + " " + handledOname
			},
			{
				"2", handledSurname + " " + handledName
			}
		};
		// check if the two strings share at least one common word. If they do not, they are obviously wrong
		if (Math.round((1 - new CosineDistance().apply(input[0][1], input[1][1])) * 100) == 0) {
			MatchService matchService = new MatchService();

			List<Document> documentList = Arrays.asList(input).stream().map(contact -> {
				return new Document.Builder(contact[0])
					.addElement(
						new Element.Builder<String>()
							.setValue(contact[1])
							.setType(ElementType.NAME)
							.createElement())
					.createDocument();
			}).collect(Collectors.toList());
			if (matchService.applyMatchByDocId(documentList).entrySet().size() == 0) {
				if (FuzzySearch.ratio(input[0][1], input[1][1]) < 30) {
					return new Tuple2<>("wrong", reportInfo);
				}

			}

		}

//		// they have some words in common. check if orcid provides creditName or otherNames to check for distance
//		//
//		List<Document> documentList = Arrays.asList(input).stream().map(contact -> {
//			return new Document.Builder(contact[0])
//				.addElement(
//					new Element.Builder<String>()
//						.setValue(contact[1])
//						.setType(ElementType.NAME)
//						.createElement())
//				.createDocument();
//		}).collect(Collectors.toList());
//
//		MatchService matchService = new MatchService();
//
//		Map<String, List<Match<Document>>> result = matchService.applyMatchByDocId(documentList);
//
//		if (result.entrySet().size() > 0) {
//			reportInfo.setLevel("fuzzyMatch");
//			return new Tuple2<>("right", reportInfo);
//		}

		return new Tuple2<>("check", reportInfo);
	}
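An aside on the guard in the method above (my note, not part of the commit): commons-text's CosineDistance tokenises the strings into words, so rounding (1 - distance) * 100 to 0 effectively means the two normalised full names share no word at all. For instance:

	// no common token -> distance 1.0 -> the guard fires and the fuzzy checks run
	double d1 = new CosineDistance().apply("alex bullock", "gillian farnie");
	// one shared token ("zhang") -> distance < 1.0 -> the pair falls through to "check"
	double d2 = new CosineDistance().apply("zhang tian tuo", "zhang tiantuo");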
	public static Tuple2<String, ReportInfo> getStringReportInfoTuple2(Tuple2<ResultInfo, OrcidAuthotitative> pair) {
		Optional<OrcidAuthotitative> ooa = Optional.ofNullable(pair._2());
		if (!ooa.isPresent()) {
			return null;
		}
		OrcidAuthotitative oa = ooa.get();

		ResultInfo ri = pair._1();

		if (StringUtils.isBlank(ri.getSurname())) {
			PacePerson pp = new PacePerson(ri.getFullname(), false);
			ri.setSurname(pp.getNormalisedSurname());
			ri.setName(pp.getNormalisedFirstName());
		}
		ReportInfo reportInfo = new ReportInfo();
		reportInfo.setOid(oa.getOid());
		reportInfo.setOname(oa.getName());
		reportInfo.setOsurname(oa.getSurname());
		reportInfo.setOcreditname(oa.getCreditName());
		reportInfo.setAssociatedAuthors(Arrays.asList(ri));

		int level = 1;

		if (!Optional.ofNullable(oa.getSurname()).isPresent()) {
			return new Tuple2<>("missing", reportInfo);
		}
		final String handledOsurname = handleNameSurname(oa.getSurname());

		if (handledOsurname.equalsIgnoreCase("")) {
			return new Tuple2<>("missing", reportInfo);
		}
		final String handledSurname = handleNameSurname(ri.getSurname());

		if (handledSurname.equals("")) {
			return new Tuple2<>("missing", reportInfo);
		}

		// level 1: check if oSurname and surname are equal
		if (handledOsurname.equals(handledSurname)) {
			reportInfo.setLevel("level" + level);
			return new Tuple2<>("right", reportInfo);
		}
		level++;

		// level 2: check if one is contained in the other
		if (handledOsurname.contains(handledSurname) || handledSurname.contains(handledOsurname)) {
			reportInfo.setLevel("level" + level);
			return new Tuple2<>("right", reportInfo);
		}
		level++;
		// level 3: check if one of the two is composed of more than one word. In this case concatenate the two words
		// and check again (Mohammadi Peyhani vs Mohammadipeyhani)
		String[] handledorcidSplit = handledOsurname.split(" ");
		String[] handledresultSplit = handledSurname.split(" ");
		if (handledorcidSplit.length == 2) {
			String tmpSurname = handledorcidSplit[0] + handledorcidSplit[1];
			if (tmpSurname.equals(handledSurname)) {
				reportInfo.setLevel("level" + level);
				return new Tuple2<>("check", reportInfo);
			}
		}
		if (handledresultSplit.length == 2) {
			String tmpSurname = handledresultSplit[0] + handledresultSplit[1];
			// compare against the ORCID surname (a comparison against handledSurname itself could never match)
			if (tmpSurname.equals(handledOsurname)) {
				reportInfo.setLevel("level" + level);
				return new Tuple2<>("check", reportInfo);
			}
		}
		level++;
		// level 4: check if the words composing the name and the surname are the same or one list contains the
		// other. Do it for words of length bigger than two
		String handledOname = "";
		if (Optional.ofNullable(oa.getName()).isPresent()) {
			handledOname = handleNameSurname(oa.getName());
		}
		String handledName = "";
		if (Optional.ofNullable(ri.getName()).isPresent()) {
			handledName = handleNameSurname(ri.getName());
		}

		final List<String> orcidList = new ArrayList<>();
		final List<String> paperList = new ArrayList<>();

		addInList(orcidList, handledOname);
		addInList(orcidList, handledOsurname);

		addInList(paperList, handledSurname);
		addInList(paperList, handledName);

		if (checkListContainment(reportInfo, level, orcidList, paperList))
			return new Tuple2<>("check", reportInfo);
		level++;

		// level 5: mix of level 3 and level 4, with name and surname possibly inverted
		handledorcidSplit = handledOsurname.split(" ");
		handledresultSplit = handledName.split(" ");

		if (handledorcidSplit.length == 2) {
			orcidList.clear();
			orcidList.add(handledorcidSplit[0] + handledorcidSplit[1]);
			addInList(orcidList, handledOname);
			if (checkListContainment(reportInfo, level, orcidList, paperList)) {
				return new Tuple2<>("check", reportInfo);
			}
			orcidList.clear();
			orcidList.add(handledorcidSplit[1] + handledorcidSplit[0]);
			addInList(orcidList, handledOname);
			if (checkListContainment(reportInfo, level, orcidList, paperList)) {
				return new Tuple2<>("check", reportInfo);
			}
		}
		if (handledresultSplit.length == 2) {
			orcidList.clear();
			addInList(orcidList, handledOname);
			addInList(orcidList, handledOsurname);
			paperList.clear();
			paperList.add(handledresultSplit[0] + handledresultSplit[1]);
			addInList(paperList, handledSurname);
			if (checkListContainment(reportInfo, level, orcidList, paperList))
				return new Tuple2<>("check", reportInfo);
			paperList.clear();
			paperList.add(handledresultSplit[1] + handledresultSplit[0]);
			addInList(paperList, handledSurname);
			if (checkListContainment(reportInfo, level, orcidList, paperList))
				return new Tuple2<>("check", reportInfo);
		}
		level++;

		// level 6: apply the Levenshtein distance to surnames longer than 3 characters
		if (handledOsurname.length() > 3 && handledSurname.length() > 3) {
			LevenshteinDistance l = new LevenshteinDistance();
			if (l.apply(handledOsurname, handledSurname) <= 2) {
				reportInfo.setLevel("level" + level);
				return new Tuple2<>("check", reportInfo);
			}
		}
		if (handledOsurname.length() > 3 && handledName.length() > 3) {
			LevenshteinDistance l = new LevenshteinDistance();
			if (l.apply(handledOsurname, handledName) <= 2) {
				reportInfo.setLevel("level" + level);
				return new Tuple2<>("check", reportInfo);
			}
		}

		return new Tuple2<>("wrong", reportInfo);
	}

	private static boolean checkListContainment(ReportInfo reportInfo, int level, List<String> orcidList,
		List<String> paperList) {
		if (orcidList.size() <= paperList.size()) {
			if (searchIn(paperList, orcidList)) {
				reportInfo.setLevel("level" + level);
				return true;
			}
		} else {
			if (searchIn(orcidList, paperList)) {
				reportInfo.setLevel("level" + level);
				return true;
			}
		}
		return false;
	}

	/**
	 * Searches in list1 all the words of list2
	 * @param list1 the list in which to search for the words
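A small illustration (mine, not part of the commit) of the containment semantics used by checkListContainment above, based on the "Amilcar António Teiga Teixeira" / "Amílcar Teixeira" pair from the test data further down:

	List<String> orcidList = Arrays.asList("amilcar", "antonio", "teiga", "teixeira");
	List<String> paperList = Arrays.asList("amilcar", "teixeira");
	// paperList is the shorter list, so searchIn(orcidList, paperList) is evaluated:
	// every word of paperList occurs in orcidList, the level is set and "check" is reported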
@@ -2,19 +2,47 @@

package eu.dnetlib.dhp.oa.graph.clean.authorpids;

import java.io.Serializable;
import java.util.List;

public class OrcidAuthotitative implements Serializable {
	private String oid;
	private String name;
	private String surname;
-	private String creditname;
+	private String creditName;
	private String otherName;
	private List<String> otherNames;
	private String errorCode;

-	public String getCreditname() {
-		return creditname;
+	public String getOtherName() {
+		return otherName;
	}

-	public void setCreditname(String creditname) {
-		this.creditname = creditname;
+	public void setOtherName(String otherName) {
+		this.otherName = otherName;
	}

	public List<String> getOtherNames() {
		return otherNames;
	}

	public void setOtherNames(List<String> otherNames) {
		this.otherNames = otherNames;
	}

	public String getErrorCode() {
		return errorCode;
	}

	public void setErrorCode(String errorCode) {
		this.errorCode = errorCode;
	}

	public String getCreditName() {
		return creditName;
	}

	public void setCreditName(String creditName) {
		this.creditName = creditName;
	}

	public String getOid() {
@@ -73,7 +73,7 @@ public class PrepareResultsSparkJob implements Serializable {

		result.createOrReplaceTempView("result");

-		String query = "select auth.name name, auth.surname surname, auth.fullname fullname, pIde.value orcid, id, cf.value collectedfrom"
+		String query = "select auth.name name, auth.surname surname, auth.fullname fullname, pIde.value orcid, id, cf.value collectedfrom "
			+
			"from result " +
			"lateral view explode(author) a as auth " +
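The one-character change above matters because the query is assembled by string concatenation (an illustration, not part of the commit):

	String bad = "select cf.value collectedfrom" + "from result";
	// -> "select cf.value collectedfromfrom result": the from keyword is swallowed
	String good = "select cf.value collectedfrom " + "from result";
	// -> "select cf.value collectedfrom from result"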
@@ -12,6 +12,16 @@ public class ReportInfo implements Serializable {

	private List<ResultInfo> associatedAuthors;

	private String level;

	public String getLevel() {
		return level;
	}

	public void setLevel(String level) {
		this.level = level;
	}

	public String getOid() {
		return oid;
	}
@@ -404,7 +404,7 @@
				--conf spark.sql.shuffle.partitions=7680
			</spark-opts>
			<arg>--preparedInfoPath</arg><arg>${workingDir}/dataset</arg>
-			<arg>--outputPath</arg><arg>${utputPath}/dataset</arg>
+			<arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
			<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
			<arg>--inputPath</arg><arg>${inputPath}/dataset</arg>
			<arg>--orcidInputPath</arg><arg>${orcidInputPath}</arg>
@@ -182,7 +182,7 @@
	<decision name="cleanorreport">
		<switch>
			<case to="make_report">${wf:conf('clean') eq false}</case>
-			<case to="clean_orcid_copy">${wf:conf('clean') eq true}</case>
+			<case to="clean_orcid">${wf:conf('clean') eq true}</case>
			<default to="make_report"/>
		</switch>
	</decision>
@@ -1,11 +1,19 @@

package eu.dnetlib.dhp.oa.graph.clean;

-import java.io.IOException;
+import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.CosineDistance;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.yarn.webapp.hamlet.Hamlet;
import org.apache.spark.SparkConf;

@@ -21,8 +29,19 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;
import com.intuit.fuzzymatcher.component.MatchService;
import com.intuit.fuzzymatcher.domain.Document;
import com.intuit.fuzzymatcher.domain.Element;
import com.intuit.fuzzymatcher.domain.ElementType;
import com.intuit.fuzzymatcher.domain.Match;
import com.wcohen.ss.Levenstein;

import eu.dnetlib.dhp.oa.graph.clean.authorpids.*;
import jdk.nashorn.internal.ir.annotations.Ignore;
import me.xdrop.fuzzywuzzy.FuzzySearch;
import net.sf.saxon.trans.Maker;
import scala.Tuple2;

public class CleanOrcidTest {
@@ -34,6 +53,106 @@ public class CleanOrcidTest {

	private static final Logger log = LoggerFactory.getLogger(CleanOrcidTest.class);

	// needed for fuzzywuzzy to get a lower bound ratio under which the authors are most probably different
	String[][] wrong = {
		{ "1", MakeReportSparkJob.handleNameSurname("Alex Bullock") },
		{ "2", MakeReportSparkJob.handleNameSurname("Gillian Farnie") },
		{ "3", MakeReportSparkJob.handleNameSurname("Luís Rocha") },
		{ "4", MakeReportSparkJob.handleNameSurname("Pedro Relvas") },
		{ "9", MakeReportSparkJob.handleNameSurname("Prasanth Manohar") },
		{ "10", MakeReportSparkJob.handleNameSurname("Nachimuthu Ramesh") }
	};

	String[][] input = {
		{ "1", MakeReportSparkJob.handleNameSurname("Dr. Ulrike Elsdoerfer Ph.D.") },
		{ "2", MakeReportSparkJob.handleNameSurname("Ulrike Elsdörfer") },
		{ "3", MakeReportSparkJob.handleNameSurname("Steven Ossont") },
		{ "4", MakeReportSparkJob.handleNameSurname("Steven J. Johnston") },
		{ "5", MakeReportSparkJob.handleNameSurname("Joanna Molyn") },
		{ "6", MakeReportSparkJob.handleNameSurname("Joanna Molyn-Blanchfield") },
		{ "7", MakeReportSparkJob.handleNameSurname("Zhang Tian-Tuo") },
		{ "8", MakeReportSparkJob.handleNameSurname("Zhang Tiantuo") },
		{ "9", MakeReportSparkJob.handleNameSurname("Prasanth Manohar") },
		{ "10", MakeReportSparkJob.handleNameSurname("Nachimuthu Ramesh") },
		{ "9", MakeReportSparkJob.handleNameSurname("Hassan Ahmed") },
		{ "10", MakeReportSparkJob.handleNameSurname("Hassan Mohamed") },
		{ "11", MakeReportSparkJob.handleNameSurname("Jonathan ODonnell") },
		{ "12", MakeReportSparkJob.handleNameSurname("Jonathon A. O Dannell") },
		{ "11", MakeReportSparkJob.handleNameSurname("Amilcar António Teiga Teixeira") },
		{ "12", MakeReportSparkJob.handleNameSurname("Amílcar Teixeira") },
		{ "13", MakeReportSparkJob.handleNameSurname("Bruno Rossion") },
		{ "14", MakeReportSparkJob.handleNameSurname("B. Rossion") },
		{ "15", MakeReportSparkJob.handleNameSurname("TINGYOU WANG") },
		{ "16", MakeReportSparkJob.handleNameSurname("Wang Ting-You") },
		{ "17", MakeReportSparkJob.handleNameSurname("Jacob Moran-Gilad") },
		{ "18", MakeReportSparkJob.handleNameSurname("Moran-Gilad Jacon") },
		{ "19", MakeReportSparkJob.handleNameSurname("Adelle Semmler") },
		{ "20", MakeReportSparkJob.handleNameSurname("Adelle Craig") }
	};

	@BeforeAll
	public static void beforeAll() throws IOException {
		workingDir = Files.createTempDirectory(CleanOrcidTest.class.getSimpleName());
@@ -168,4 +287,151 @@ public class CleanOrcidTest {
			.map(item -> OBJECT_MAPPER.readValue(item, ResultInfo.class));
	}

	@Test
	public void cleanNameSurname() {
		String name = "Hübner";
		String surname = "Hubenr";

		name = StringUtils
			.stripAccents(name.toLowerCase().trim())
			.replaceAll("[^a-z\\s]+", " ");

		surname = StringUtils
			.stripAccents(surname.toLowerCase().trim())
			.replace(".", "")
			.replaceAll("[^a-z\\s]+", " ")
			.replace("'", " ")
			.trim();

		Levenstein l = new Levenstein();
		double score = Math.abs(l.score(name, surname));

		System.out.println(score);

	}

	@Test
	public void testMakeReport() {
		ResultInfo ri = new ResultInfo();
		ri.setName("Prasanth");
		ri.setSurname("Manohar");

		OrcidAuthotitative oa = new OrcidAuthotitative();
		oa.setName("Nachimuthu");
		oa.setSurname("Ramesh");

		Tuple2<ResultInfo, OrcidAuthotitative> t2 = new Tuple2<ResultInfo, OrcidAuthotitative>(ri, oa);
		Tuple2<String, ReportInfo> tmp = MakeReportSparkJob.getStringReportInfoFuzzyTuple2(t2);

		System.out.println(new Gson().toJson(tmp._2(), ReportInfo.class));
	}

	@Test
	public void cosineDistanceTest() {

		for (int i = 0; i < input.length; i += 2) {
			double cosineDistance = new CosineDistance().apply(input[i][1], input[i + 1][1]);
			System.out
				.println(
					"CosineDistance of '" + input[i][1] + "' & '" + input[i + 1][1] + "' | Words in strings are "
						+ Math.round(cosineDistance * 100) + "% dis-similar or "
						+ Math.round((1 - cosineDistance) * 100) + "% similar.");

		}
	}

	@Test
	public void testAuthorFuzzyMatch() {

		Function<String, String> clean = s -> MakeReportSparkJob.handleNameSurname(s);

		List<Document> documentList = Arrays.asList(input).stream().map(contact -> {
			return new Document.Builder(contact[0])
				.addElement(
					new Element.Builder<String>()
						.setValue(contact[1])
						.setType(ElementType.NAME)
						.setPreProcessingFunction(clean)
						.createElement())
				.createDocument();
		}).collect(Collectors.toList());

		MatchService matchService = new MatchService();

		Map<String, List<Match<Document>>> result = matchService.applyMatchByDocId(documentList);

		result.entrySet().forEach(entry -> {
			entry.getValue().forEach(match -> {
				System.out
					.println(
						"Data: " + match.getData() + " Matched With: " + match.getMatchedWith() + " Score: "
							+ match.getScore().getResult());
			});
		});
	}

	@Test
	public void FuzzyWuzzyTest() {
		applyFuzzyWuzzy(input);
	}

	@Test
	public void FuzzyWuzzyWrongTest() throws IOException {
		final String inputPath = getClass()
			.getResource("/eu/dnetlib/dhp/oa/graph/clean/wrongassociation.json")
			.getPath();

		BufferedReader reader = new BufferedReader(new FileReader(inputPath));
		String line;
		List<OrcidAuthor> orcidAuthorList = new ArrayList<>();
		while (null != (line = reader.readLine())) {
			orcidAuthorList.add(new Gson().fromJson(line, OrcidAuthor.class));
		}

		applyFuzzyWuzzy(orcidAuthorList);
	}

	private void applyFuzzyWuzzy(List<OrcidAuthor> orcidAuthorList) {
		orcidAuthorList.forEach(entry -> {
			String orcid = MakeReportSparkJob.handleNameSurname(entry.getOrcid());
			String result = MakeReportSparkJob.handleNameSurname(entry.getResult());
			System.out
				.println(
					"FuzzyWuzzy of '" + orcid + "' & '" + result + "' | Similarity ratio "
						+ FuzzySearch.ratio(orcid, result));
		});
	}

	private void applyFuzzyWuzzy(String[][] input) {
		for (int i = 0; i < input.length; i += 2) {
			System.out
				.println(
					"FuzzyWuzzy of '" + input[i][1] + "' & '" + input[i + 1][1] + "' | Similarity ratio "
						+ FuzzySearch.ratio(input[i][1], input[i + 1][1]));

		}
	}

	class OrcidAuthor implements Serializable {

		private String orcid;
		private String result;

		public String getOrcid() {
			return orcid;
		}

		public void setOrcid(String orcid) {
			this.orcid = orcid;
		}

		public String getResult() {
			return result;
		}

		public void setResult(String result) {
			this.result = result;
		}
	}

}
@@ -0,0 +1,35 @@
{"orcid": "Alex Bullock", "result": "Gillian Farnie"}
{"orcid": "Luís Rocha", "result": "Pedro Relvas"}
{"orcid": "Prasanth Manohar", "result": "Nachimuthu Ramesh"}
{"orcid": "Zhiying Lin", "result": "Guanglong Huang"}
{"orcid": "Andrew Golnar", "result": "Kim Pepin"}
{"orcid": "Gilles Marcou", "result": "Filippo Lunghini"}
{"orcid": "Philip Hahn", "result": "John Maron"}
{"orcid": "Kirsty Gibson", "result": "Kim R. Hardie"}
{"orcid": "Paula Lago", "result": "Shingo Takeda"}
{"orcid": "Paul Seidler", "result": "Dalziel J. Wilson"}
{"orcid": "Solomon Okunade", "result": "Rufus Adebayo Ajisafe"}
{"orcid": "Emi Arai", "result": "Masaru Hasegawa"}
{"orcid": "Dr Muhammad Yameen Sandhu", "result": "Nutapong Somjit"}
{"orcid": "Xianlei Cai", "result": "Weiming Yu"}
{"orcid": "Bing He", "result": "Chuan Xing"}
{"orcid": "JULIEN COURCHET", "result": "Franck Polleux"}
{"orcid": "Xiaoyun Pan", "result": "Liru Chen"}
{"orcid": "Marianne Okal", "result": "Brendan Hodge"}
{"orcid": "Michal Fereczkowski", "result": "Silje Grini Nielsen"}
{"orcid": "Nobuyuki Nakai", "result": "Tadafumi Kurogi"}
{"orcid": "Colin Daniel", "result": "Christine Cuyler"}
{"orcid": "Xavier Arnan", "result": "Anna Torné-Noguera"}
{"orcid": "Denita Hadziabdic", "result": "Meher Ony"}
{"orcid": "Kor de Jong", "result": "K. Koning"}
{"orcid": "Chaya Patel", "result": "David Leib"}
{"orcid": "Fagner Carniel", "result": "Adonai Lacruz"}
{"orcid": "Carrie Peltz", "result": "Erica Kornblith"}
{"orcid": "Kathryn Huyvaert", "result": "Larissa L. Bailey"}
{"orcid": "Christine Provost", "result": "Nathalie Sennéchael"}
{"orcid": "Nancy Pachana", "result": "Lisa DiNatale"}
{"orcid": "ARDESHIR BAYAT", "result": "P. Marcos Gorresen"}
{"orcid": "Paul Berkowitz", "result": "Silje Grini Nielsen"}
{"orcid": "Alice Laciny", "result": "Brian Metscher"}
{"orcid": "Octavio Rojas", "result": "Josie A. Griffin"}
{"orcid": "Carlo Sandroni", "result": "Riccardo Scattolini"}