This commit is contained in:
Miriam Baglioni 2020-11-16 10:53:12 +01:00
parent 0f1a4f6637
commit c29d142087
10 changed files with 696 additions and 88 deletions

View File

@ -47,6 +47,12 @@
<groupId>org.apache.commons</groupId> <groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId> <artifactId>commons-compress</artifactId>
</dependency> </dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-text -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-text</artifactId>
<version>1.9</version>
</dependency>
<dependency> <dependency>
<groupId>commons-io</groupId> <groupId>commons-io</groupId>
@ -123,6 +129,21 @@
<version>2.4.0.cloudera2</version> <version>2.4.0.cloudera2</version>
</dependency> </dependency>
<!-- https://mvnrepository.com/artifact/me.xdrop/fuzzywuzzy -->
<dependency>
<groupId>me.xdrop</groupId>
<artifactId>fuzzywuzzy</artifactId>
<version>1.3.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.intuit.fuzzymatcher/fuzzy-matcher -->
<dependency>
<groupId>com.intuit.fuzzymatcher</groupId>
<artifactId>fuzzy-matcher</artifactId>
<version>1.0.4</version>
</dependency>
</dependencies> </dependencies>

View File

@ -5,9 +5,12 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.io.Serializable; import java.io.Serializable;
import java.util.*; import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.CosineDistance;
import org.apache.commons.text.similarity.LevenshteinDistance;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
@ -23,13 +26,49 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.intuit.fuzzymatcher.component.MatchService;
import com.intuit.fuzzymatcher.domain.Document;
import com.intuit.fuzzymatcher.domain.Element;
import com.intuit.fuzzymatcher.domain.ElementType;
import com.intuit.fuzzymatcher.domain.Match;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.common.PacePerson; import eu.dnetlib.dhp.common.PacePerson;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import me.xdrop.fuzzywuzzy.FuzzySearch;
import scala.Tuple2; import scala.Tuple2;
/**
* It checks if the orcid provided by ORCID and the one found in the result have the same author information. The
* author information is handled before the checking. Handling steps:
words are lower-cased and trimmed, accents are replaced with their non-accented equivalent. Only alphabetical
* characters and white space are retained. All the other chars are substituted with space.
*
* The check is made on different specification levels:
*
* Level1: orcid author surname and result author surname are identical. We consider the match to be right
*
* Level2: we verify if orcid author surname contains result author surname or vice versa. If it is the case we consider
* the match to be right
*
* Level3: we verify if one of the two surnames is composed by two words. In that case we concatenate the words and do
* the checking again. If the two match, we consider the match to be checked
*
* Level4: name and surname can be inverted in one of the two entities. We consider the set of words composing the name
* and the surname that are longer than 2 for orcid and result. If all the words of the shorter list are contained in
* the longer one, we consider the match to be checked
*
* Level5: name and surname are inverted but one of the two is composed by two words. Mix of Level3 and level4. We consider
* the match to be checked
*
* Level6: surnames differ by a few chars. We apply the Levenshtein distance on surnames if their length is bigger than 3.
* If the distance is less than 2 we consider the match to be checked
*
* In all the other cases the match is considered wrong
*
*/
public class MakeReportSparkJob implements Serializable { public class MakeReportSparkJob implements Serializable {
private static final Logger log = LoggerFactory.getLogger(MakeReportSparkJob.class); private static final Logger log = LoggerFactory.getLogger(MakeReportSparkJob.class);
@ -105,13 +144,25 @@ public class MakeReportSparkJob implements Serializable {
private static void addInList(List<String> list, String to_add) { private static void addInList(List<String> list, String to_add) {
for (String word : to_add.split(" ")) { for (String word : to_add.split(" ")) {
if (word.length() >= 2) { if (word.length() > 2) {
list.add(word); list.add(word);
} }
} }
} }
/**
 * Normalises a name or surname for comparison: lower-cases it, removes dots,
 * strips a leading "dr " title, removes accents, and replaces every
 * non-alphabetical character with a space.
 *
 * @param input the raw name/surname (must not be null)
 * @return the normalised string; may be empty when no alphabetical chars remain
 */
public static String handleNameSurname(String input) {
    input = input.toLowerCase().replace(".", "");
    // strip only a "dr " title prefix: requiring the trailing space avoids
    // truncating real names such as "Drew", and avoids a
    // StringIndexOutOfBoundsException on inputs shorter than 3 characters
    // (the original tested startsWith("dr") and unconditionally cut 3 chars)
    if (input.startsWith("dr ")) {
        input = input.substring(3);
    }
    return StringUtils
        .stripAccents(input.trim())
        .replaceAll("[^a-z\\s]+", " ")
        .trim();
}
private static <I extends Result> void makeReport(SparkSession spark, String inputPath, Class<I> entityClazz, private static <I extends Result> void makeReport(SparkSession spark, String inputPath, Class<I> entityClazz,
String outputPath, String preparedInfoPath, String outputPath, String preparedInfoPath,
Dataset<OrcidAuthotitative> authoritative) { Dataset<OrcidAuthotitative> authoritative) {
@ -125,84 +176,7 @@ public class MakeReportSparkJob implements Serializable {
.equalTo(resultInfo.col("orcid")), .equalTo(resultInfo.col("orcid")),
"left") "left")
.map((MapFunction<Tuple2<ResultInfo, OrcidAuthotitative>, Tuple2<String, ReportInfo>>) pair -> { .map((MapFunction<Tuple2<ResultInfo, OrcidAuthotitative>, Tuple2<String, ReportInfo>>) pair -> {
Optional<OrcidAuthotitative> ooa = Optional.ofNullable(pair._2()); return getStringReportInfoFuzzyTuple2(pair);
if (!ooa.isPresent()) {
return null;
}
OrcidAuthotitative oa = ooa.get();
ResultInfo ri = pair._1();
if (StringUtils.isBlank(ri.getSurname())) {
PacePerson pp = new PacePerson(ri.getFullname(), false);
ri.setSurname(pp.getNormalisedSurname());
ri.setName(pp.getNormalisedFirstName());
}
ReportInfo reportInfo = new ReportInfo();
reportInfo.setOid(oa.getOid());
reportInfo.setOname(oa.getName());
reportInfo.setOsurname(oa.getSurname());
reportInfo.setOcreditname(oa.getCreditname());
reportInfo.setAssociatedAuthors(Arrays.asList(ri));
if (!Optional.ofNullable(oa.getSurname()).isPresent()) {
return new Tuple2<>("missing", reportInfo);
}
final String handledOsurname = StringUtils
.stripAccents(oa.getSurname().toLowerCase().trim())
.replace("-", " ")
.replace(".", "");
final String handledSurname = StringUtils
.stripAccents(ri.getSurname().toLowerCase().trim())
.replace("-", " ")
.replace(".", "");
if (!handledOsurname
.equalsIgnoreCase(handledSurname)) {
if (!handledOsurname.contains(handledSurname) && !handledSurname.contains(handledOsurname)) {
// check if the words composing the name and the surname are the same or one list contains the
// other.
// do for words of lenght bigger than two
String handledOname = "";
if (Optional.ofNullable(oa.getName()).isPresent()) {
handledOname = StringUtils
.stripAccents(oa.getName().toLowerCase().trim())
.replace("-", " ")
.replace(".", "");
}
String handledName = "";
if (Optional.ofNullable(ri.getName()).isPresent()) {
handledName = StringUtils
.stripAccents(ri.getName().toLowerCase().trim())
.replace("-", " ")
.replace(".", "");
}
final List<String> orcidList = new ArrayList<>();
final List<String> paperList = new ArrayList<>();
addInList(orcidList, handledOname);
addInList(orcidList, handledOsurname);
addInList(paperList, handledSurname);
addInList(paperList, handledName);
if (orcidList.size() <= paperList.size()) {
if (searchIn(paperList, orcidList)) {
return new Tuple2<>("check", reportInfo);
}
} else {
if (searchIn(orcidList, paperList)) {
return new Tuple2<>("check", reportInfo);
}
}
// todo add another level of checking (use levenstein)
return new Tuple2<>("wrong", reportInfo);
}
return new Tuple2<>("right", reportInfo);
}
return new Tuple2<>("right", reportInfo);
}, Encoders.tuple(Encoders.STRING(), Encoders.bean(ReportInfo.class))) }, Encoders.tuple(Encoders.STRING(), Encoders.bean(ReportInfo.class)))
.filter(Objects::nonNull); .filter(Objects::nonNull);
@ -222,6 +196,280 @@ public class MakeReportSparkJob implements Serializable {
} }
/**
 * Placeholder for a combined fuzzy-similarity score between the author string
 * coming from ORCID and the one found in the result. It currently always
 * returns 0 and appears unused within this file.
 *
 * Intended strategy (translated from the original Italian notes):
 * - pairs matching 1.0 with fuzzy-matcher => considered right
 * - pairs matching above 0.66 with fuzzy-matcher => considered right
 * - pairs not matched above, but with a FuzzyWuzzy ratio above 0.5 => "check"
 *   (probably right)
 * - pairs with a FuzzyWuzzy ratio between 0.5 and 0.3 => lowest-confidence band
 */
private static double fuzzyMatch(String orcid, String result) {
return 0;
}
/**
 * Fuzzy classification of a (result author, ORCID author) pair.
 *
 * @param pair result-side author info paired with the authoritative ORCID info
 *             (the ORCID side may be null when the join found no match)
 * @return null when no ORCID author is associated; otherwise a tuple whose key is
 *         "missing" when either surname is absent/empty after normalisation,
 *         "wrong"   when the two full names share no word (cosine similarity
 *                   rounds to 0), fuzzy-matcher finds no match, and the
 *                   FuzzyWuzzy ratio is below 30,
 *         "check"   in every other case
 */
public static Tuple2<String, ReportInfo> getStringReportInfoFuzzyTuple2(
    Tuple2<ResultInfo, OrcidAuthotitative> pair) {
    Optional<OrcidAuthotitative> ooa = Optional.ofNullable(pair._2());
    if (!ooa.isPresent()) {
        return null;
    }
    OrcidAuthotitative oa = ooa.get();
    ResultInfo ri = pair._1();
    // when the surname is missing, derive name/surname from the fullname
    if (StringUtils.isBlank(ri.getSurname())) {
        PacePerson pp = new PacePerson(ri.getFullname(), false);
        ri.setSurname(pp.getNormalisedSurname());
        ri.setName(pp.getNormalisedFirstName());
    }
    ReportInfo reportInfo = new ReportInfo();
    reportInfo.setOid(oa.getOid());
    reportInfo.setOname(oa.getName());
    reportInfo.setOsurname(oa.getSurname());
    reportInfo.setOcreditname(oa.getCreditName());
    reportInfo.setAssociatedAuthors(Arrays.asList(ri));
    if (!Optional.ofNullable(oa.getSurname()).isPresent()) {
        return new Tuple2<>("missing", reportInfo);
    }
    final String handledOsurname = handleNameSurname(oa.getSurname());
    if (handledOsurname.isEmpty()) {
        return new Tuple2<>("missing", reportInfo);
    }
    final String handledSurname = handleNameSurname(ri.getSurname());
    if (handledSurname.isEmpty()) {
        return new Tuple2<>("missing", reportInfo);
    }
    String handledOname = "";
    if (Optional.ofNullable(oa.getName()).isPresent()) {
        handledOname = handleNameSurname(oa.getName());
    }
    String handledName = "";
    if (Optional.ofNullable(ri.getName()).isPresent()) {
        handledName = handleNameSurname(ri.getName());
    }
    String[][] input = {
        {
            "1", handledOsurname + " " + handledOname
        },
        {
            "2", handledSurname + " " + handledName
        }
    };
    // no common word at all (cosine similarity rounds to 0): probably wrong,
    // but confirm with fuzzy-matcher and FuzzyWuzzy before giving up
    if (Math.round((1 - new CosineDistance().apply(input[0][1], input[1][1])) * 100) == 0) {
        MatchService matchService = new MatchService();
        List<Document> documentList = Arrays
            .stream(input)
            .map(contact -> new Document.Builder(contact[0])
                .addElement(
                    new Element.Builder<String>()
                        .setValue(contact[1])
                        .setType(ElementType.NAME)
                        .createElement())
                .createDocument())
            .collect(Collectors.toList());
        if (matchService.applyMatchByDocId(documentList).entrySet().size() == 0) {
            // below 30 the pair is most probably a wrong association
            // (threshold chosen from the wrongassociation.json sample)
            if (FuzzySearch.ratio(input[0][1], input[1][1]) < 30) {
                return new Tuple2<>("wrong", reportInfo);
            }
        }
    }
    return new Tuple2<>("check", reportInfo);
}
/**
 * Deterministic, multi-level check of a (result author, ORCID author) pair;
 * the levels are described in the class javadoc. Sets the reached level on the
 * ReportInfo before returning.
 *
 * @param pair result-side author info paired with the authoritative ORCID info
 * @return null when no ORCID author is associated; otherwise a tuple keyed
 *         "missing", "right", "check" or "wrong"
 */
public static Tuple2<String, ReportInfo> getStringReportInfoTuple2(Tuple2<ResultInfo, OrcidAuthotitative> pair) {
    Optional<OrcidAuthotitative> ooa = Optional.ofNullable(pair._2());
    if (!ooa.isPresent()) {
        return null;
    }
    OrcidAuthotitative oa = ooa.get();
    ResultInfo ri = pair._1();
    // when the surname is missing, derive name/surname from the fullname
    if (StringUtils.isBlank(ri.getSurname())) {
        PacePerson pp = new PacePerson(ri.getFullname(), false);
        ri.setSurname(pp.getNormalisedSurname());
        ri.setName(pp.getNormalisedFirstName());
    }
    ReportInfo reportInfo = new ReportInfo();
    reportInfo.setOid(oa.getOid());
    reportInfo.setOname(oa.getName());
    reportInfo.setOsurname(oa.getSurname());
    reportInfo.setOcreditname(oa.getCreditName());
    reportInfo.setAssociatedAuthors(Arrays.asList(ri));
    int level = 1;
    if (!Optional.ofNullable(oa.getSurname()).isPresent()) {
        return new Tuple2<>("missing", reportInfo);
    }
    final String handledOsurname = handleNameSurname(oa.getSurname());
    if (handledOsurname.isEmpty()) {
        return new Tuple2<>("missing", reportInfo);
    }
    final String handledSurname = handleNameSurname(ri.getSurname());
    if (handledSurname.isEmpty()) {
        return new Tuple2<>("missing", reportInfo);
    }
    // level 1: identical surnames
    if (handledOsurname.equals(handledSurname)) {
        reportInfo.setLevel("level" + level);
        return new Tuple2<>("right", reportInfo);
    }
    level++;
    // level 2: one surname contained in the other
    if (handledOsurname.contains(handledSurname) || handledSurname.contains(handledOsurname)) {
        reportInfo.setLevel("level" + level);
        return new Tuple2<>("right", reportInfo);
    }
    level++;
    // level 3: one of the two surnames is composed of two words; concatenate
    // them and compare with the other surname (Mohammadi Peyhani vs Mohammadipeyhani)
    String[] handledorcidSplit = handledOsurname.split(" ");
    String[] handledresultSplit = handledSurname.split(" ");
    if (handledorcidSplit.length == 2) {
        String tmpSurname = handledorcidSplit[0] + handledorcidSplit[1];
        if (tmpSurname.equals(handledSurname)) {
            reportInfo.setLevel("level" + level);
            return new Tuple2<>("check", reportInfo);
        }
    }
    if (handledresultSplit.length == 2) {
        String tmpSurname = handledresultSplit[0] + handledresultSplit[1];
        // BUG FIX: compare the concatenated result surname against the ORCID
        // surname; the original compared it against handledSurname itself,
        // which can never match once a space is present
        if (tmpSurname.equals(handledOsurname)) {
            reportInfo.setLevel("level" + level);
            return new Tuple2<>("check", reportInfo);
        }
    }
    level++;
    // level 4: name and surname may be inverted. Compare the sets of words
    // (longer than two chars, see addInList) composing name + surname on both sides
    String handledOname = "";
    if (Optional.ofNullable(oa.getName()).isPresent()) {
        handledOname = handleNameSurname(oa.getName());
    }
    String handledName = "";
    if (Optional.ofNullable(ri.getName()).isPresent()) {
        handledName = handleNameSurname(ri.getName());
    }
    final List<String> orcidList = new ArrayList<>();
    final List<String> paperList = new ArrayList<>();
    addInList(orcidList, handledOname);
    addInList(orcidList, handledOsurname);
    addInList(paperList, handledSurname);
    addInList(paperList, handledName);
    if (checkListContainment(reportInfo, level, orcidList, paperList))
        return new Tuple2<>("check", reportInfo);
    level++;
    // level 5: inverted name/surname where one side is composed of two words;
    // try both concatenation orders.
    // NOTE(review): the result side is split on the *name* here (the ORCID side
    // on the surname) — this looks intentional for the inverted case, but confirm
    handledorcidSplit = handledOsurname.split(" ");
    handledresultSplit = handledName.split(" ");
    if (handledorcidSplit.length == 2) {
        orcidList.clear();
        orcidList.add(handledorcidSplit[0] + handledorcidSplit[1]);
        addInList(orcidList, handledOname);
        if (checkListContainment(reportInfo, level, orcidList, paperList)) {
            return new Tuple2<>("check", reportInfo);
        }
        orcidList.clear();
        orcidList.add(handledorcidSplit[1] + handledorcidSplit[0]);
        addInList(orcidList, handledOname);
        if (checkListContainment(reportInfo, level, orcidList, paperList)) {
            return new Tuple2<>("check", reportInfo);
        }
    }
    if (handledresultSplit.length == 2) {
        orcidList.clear();
        addInList(orcidList, handledOname);
        addInList(orcidList, handledOsurname);
        paperList.clear();
        paperList.add(handledresultSplit[0] + handledresultSplit[1]);
        addInList(paperList, handledSurname);
        if (checkListContainment(reportInfo, level, orcidList, paperList))
            return new Tuple2<>("check", reportInfo);
        paperList.clear();
        paperList.add(handledresultSplit[1] + handledresultSplit[0]);
        addInList(paperList, handledSurname);
        if (checkListContainment(reportInfo, level, orcidList, paperList))
            return new Tuple2<>("check", reportInfo);
    }
    level++;
    // level 6: surnames differing by a few chars — Levenshtein distance <= 2
    // on strings longer than 3 chars (a single instance suffices; it is stateless)
    final LevenshteinDistance levenshtein = new LevenshteinDistance();
    if (handledOsurname.length() > 3 && handledSurname.length() > 3) {
        if (levenshtein.apply(handledOsurname, handledSurname) <= 2) {
            reportInfo.setLevel("level" + level);
            return new Tuple2<>("check", reportInfo);
        }
    }
    if (handledOsurname.length() > 3 && handledName.length() > 3) {
        if (levenshtein.apply(handledOsurname, handledName) <= 2) {
            reportInfo.setLevel("level" + level);
            return new Tuple2<>("check", reportInfo);
        }
    }
    return new Tuple2<>("wrong", reportInfo);
}
/**
 * Verifies that every word of the shorter list appears in the longer one
 * (via searchIn); on success records the reached level on the report.
 *
 * @return true when the shorter list is fully contained in the longer one
 */
private static boolean checkListContainment(ReportInfo reportInfo, int level, List<String> orcidList,
    List<String> paperList) {
    final boolean orcidIsShorter = orcidList.size() <= paperList.size();
    final List<String> shorter = orcidIsShorter ? orcidList : paperList;
    final List<String> longer = orcidIsShorter ? paperList : orcidList;
    if (searchIn(longer, shorter)) {
        reportInfo.setLevel("level" + level);
        return true;
    }
    return false;
}
/** /**
* searches in list1 all the words of list 2 * searches in list1 all the words of list 2
* @param list1 the list where to search for the words * @param list1 the list where to search for the words

View File

@ -2,19 +2,47 @@
package eu.dnetlib.dhp.oa.graph.clean.authorpids; package eu.dnetlib.dhp.oa.graph.clean.authorpids;
import java.io.Serializable; import java.io.Serializable;
import java.util.List;
public class OrcidAuthotitative implements Serializable { public class OrcidAuthotitative implements Serializable {
private String oid; private String oid;
private String name; private String name;
private String surname; private String surname;
private String creditname; private String creditName;
private String otherName;
private List<String> otherNames;
private String errorCode;
public String getCreditname() { public String getOtherName() {
return creditname; return otherName;
} }
public void setCreditname(String creditname) { public void setOtherName(String otherName) {
this.creditname = creditname; this.otherName = otherName;
}
public List<String> getOtherNames() {
return otherNames;
}
public void setOtherNames(List<String> otherNames) {
this.otherNames = otherNames;
}
public String getErrorCode() {
return errorCode;
}
public void setErrorCode(String errorCode) {
this.errorCode = errorCode;
}
public String getCreditName() {
return creditName;
}
public void setCreditName(String creditName) {
this.creditName = creditName;
} }
public String getOid() { public String getOid() {

View File

@ -73,7 +73,7 @@ public class PrepareResultsSparkJob implements Serializable {
result.createOrReplaceTempView("result"); result.createOrReplaceTempView("result");
String query = "select auth.name name, auth.surname surname, auth.fullname fullname, pIde.value orcid, id, cf.value collectedfrom" String query = "select auth.name name, auth.surname surname, auth.fullname fullname, pIde.value orcid, id, cf.value collectedfrom "
+ +
"from result " + "from result " +
"lateral view explode(author) a as auth " + "lateral view explode(author) a as auth " +

View File

@ -12,6 +12,16 @@ public class ReportInfo implements Serializable {
private List<ResultInfo> associatedAuthors; private List<ResultInfo> associatedAuthors;
private String level;
public String getLevel() {
return level;
}
public void setLevel(String level) {
this.level = level;
}
public String getOid() { public String getOid() {
return oid; return oid;
} }

View File

@ -404,7 +404,7 @@
--conf spark.sql.shuffle.partitions=7680 --conf spark.sql.shuffle.partitions=7680
</spark-opts> </spark-opts>
<arg>--preparedInfoPath</arg><arg>${workingDir}/dataset</arg> <arg>--preparedInfoPath</arg><arg>${workingDir}/dataset</arg>
<arg>--outputPath</arg><arg>${utputPath}/dataset</arg> <arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg> <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--inputPath</arg><arg>${inputPath}/dataset</arg> <arg>--inputPath</arg><arg>${inputPath}/dataset</arg>
<arg>--orcidInputPath</arg><arg>${orcidInputPath}</arg> <arg>--orcidInputPath</arg><arg>${orcidInputPath}</arg>

View File

@ -182,7 +182,7 @@
<decision name="cleanorreport"> <decision name="cleanorreport">
<switch> <switch>
<case to="make_report">${wf:conf('clean') eq false}</case> <case to="make_report">${wf:conf('clean') eq false}</case>
<case to="clean_orcid_copy">${wf:conf('clean') eq true}</case> <case to="clean_orcid">${wf:conf('clean') eq true}</case>
<default to="make_report"/> <default to="make_report"/>
</switch> </switch>
</decision> </decision>

View File

@ -1,11 +1,19 @@
package eu.dnetlib.dhp.oa.graph.clean; package eu.dnetlib.dhp.oa.graph.clean;
import java.io.IOException; import java.io.*;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.CosineDistance;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.hadoop.yarn.webapp.hamlet.Hamlet; import org.apache.hadoop.yarn.webapp.hamlet.Hamlet;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@ -21,8 +29,19 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;
import com.intuit.fuzzymatcher.component.MatchService;
import com.intuit.fuzzymatcher.domain.Document;
import com.intuit.fuzzymatcher.domain.Element;
import com.intuit.fuzzymatcher.domain.ElementType;
import com.intuit.fuzzymatcher.domain.Match;
import com.wcohen.ss.Levenstein;
import eu.dnetlib.dhp.oa.graph.clean.authorpids.*; import eu.dnetlib.dhp.oa.graph.clean.authorpids.*;
import jdk.nashorn.internal.ir.annotations.Ignore;
import me.xdrop.fuzzywuzzy.FuzzySearch;
import net.sf.saxon.trans.Maker;
import scala.Tuple2;
public class CleanOrcidTest { public class CleanOrcidTest {
@ -34,6 +53,106 @@ public class CleanOrcidTest {
private static final Logger log = LoggerFactory.getLogger(CleanOrcidTest.class); private static final Logger log = LoggerFactory.getLogger(CleanOrcidTest.class);
// needed for fuzzywuzzy to get a lower bound ratio under which the authors are most probably different
// {id, normalised author string} rows; consecutive rows form a pair of authors
// known NOT to be the same person — used to calibrate the FuzzyWuzzy lower bound
String[][] wrong = {
{
"1", MakeReportSparkJob.handleNameSurname("Alex Bullock")
},
{
"2", MakeReportSparkJob.handleNameSurname("Gillian Farnie")
},
{
"3", MakeReportSparkJob.handleNameSurname("Luís Rocha")
},
{
"4", MakeReportSparkJob.handleNameSurname("Pedro Relvas")
},
{
"9", MakeReportSparkJob.handleNameSurname("Prasanth Manohar")
},
{
"10", MakeReportSparkJob.handleNameSurname("Nachimuthu Ramesh")
}
};
// {id, normalised author string} rows; consecutive rows form a pair to be
// compared by the distance/fuzzy tests below. Ids "9"-"12" are reused for
// distinct pairs — presumably only contact[1] matters to the tests; verify
String[][] input = {
{
"1", MakeReportSparkJob.handleNameSurname("Dr. Ulrike Elsdoerfer Ph.D.")
},
{
"2", MakeReportSparkJob.handleNameSurname("Ulrike Elsdörfer")
},
{
"3", MakeReportSparkJob.handleNameSurname("Steven Ossont")
},
{
"4", MakeReportSparkJob.handleNameSurname("Steven J. Johnston")
},
{
"5", MakeReportSparkJob.handleNameSurname("Joanna Molyn")
},
{
"6", MakeReportSparkJob.handleNameSurname("Joanna Molyn-Blanchfield")
},
{
"7", MakeReportSparkJob.handleNameSurname("Zhang Tian-Tuo")
},
{
"8", MakeReportSparkJob.handleNameSurname("Zhang Tiantuo")
},
{
"9", MakeReportSparkJob.handleNameSurname("Prasanth Manohar")
},
{
"10", MakeReportSparkJob.handleNameSurname("Nachimuthu Ramesh")
},
{
"9", MakeReportSparkJob.handleNameSurname("Hassan Ahmed")
},
{
"10", MakeReportSparkJob.handleNameSurname("Hassan Mohamed")
},
{
"11", MakeReportSparkJob.handleNameSurname("Jonathan ODonnell")
},
{
"12", MakeReportSparkJob.handleNameSurname("Jonathon A. O Dannell")
},
{
"11", MakeReportSparkJob.handleNameSurname("Amilcar António Teiga Teixeira")
},
{
"12", MakeReportSparkJob.handleNameSurname("Amílcar Teixeira")
},
{
"13", MakeReportSparkJob.handleNameSurname("Bruno Rossion")
},
{
"14", MakeReportSparkJob.handleNameSurname("B. Rossion")
},
{
"15", MakeReportSparkJob.handleNameSurname("TINGYOU WANG")
},
{
"16", MakeReportSparkJob.handleNameSurname("Wang Ting-You")
},
{
"17", MakeReportSparkJob.handleNameSurname("Jacob Moran-Gilad")
},
{
"18", MakeReportSparkJob.handleNameSurname("Moran-Gilad Jacon")
},
{
"19", MakeReportSparkJob.handleNameSurname("Adelle Semmler")
},
{
"20", MakeReportSparkJob.handleNameSurname("Adelle Craig")
}
};
@BeforeAll @BeforeAll
public static void beforeAll() throws IOException { public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(CleanOrcidTest.class.getSimpleName()); workingDir = Files.createTempDirectory(CleanOrcidTest.class.getSimpleName());
@ -168,4 +287,151 @@ public class CleanOrcidTest {
.map(item -> OBJECT_MAPPER.readValue(item, ResultInfo.class)); .map(item -> OBJECT_MAPPER.readValue(item, ResultInfo.class));
} }
@Test
public void cleanNameSurname() {
    // normalise two sample strings and print their Levenstein score
    String cleanedName = StringUtils
        .stripAccents("Hübner".toLowerCase().trim())
        .replaceAll("[^a-z\\s]+", " ");
    String cleanedSurname = StringUtils
        .stripAccents("Hubenr".toLowerCase().trim())
        .replace(".", "")
        .replaceAll("[^a-z\\s]+", " ")
        .replace("'", " ")
        .trim();
    final Levenstein distance = new Levenstein();
    System.out.println(Math.abs(distance.score(cleanedName, cleanedSurname)));
}
@Test
public void testMakeReport() {
    // author info as found in the result
    ResultInfo resultInfo = new ResultInfo();
    resultInfo.setName("Prasanth");
    resultInfo.setSurname("Manohar");
    // authoritative info provided by ORCID
    OrcidAuthotitative orcidInfo = new OrcidAuthotitative();
    orcidInfo.setName("Nachimuthu");
    orcidInfo.setSurname("Ramesh");
    Tuple2<String, ReportInfo> report = MakeReportSparkJob
        .getStringReportInfoFuzzyTuple2(new Tuple2<ResultInfo, OrcidAuthotitative>(resultInfo, orcidInfo));
    System.out.println(new Gson().toJson(report._2(), ReportInfo.class));
}
@Test
public void cosineDistanceTest() {
    // consecutive rows of the input table form the pair under comparison
    for (int idx = 0; idx < input.length; idx += 2) {
        final String first = input[idx][1];
        final String second = input[idx + 1][1];
        final double cosineDistance = new CosineDistance().apply(first, second);
        System.out
            .println(
                "CosineDistance of '" + first + "' & '" + second + "' | Words in strings are "
                    + Math.round(cosineDistance * 100) + "% dis-similar or "
                    + Math.round((1 - cosineDistance) * 100) + "% similar.");
    }
}
@Test
public void testAuthorFuzzyMatch() {
    // same normalisation used by the report job, plugged in as pre-processing
    Function<String, String> clean = MakeReportSparkJob::handleNameSurname;
    List<Document> documents = Arrays
        .stream(input)
        .map(row -> new Document.Builder(row[0])
            .addElement(
                new Element.Builder<String>()
                    .setValue(row[1])
                    .setType(ElementType.NAME)
                    .setPreProcessingFunction(clean)
                    .createElement())
            .createDocument())
        .collect(Collectors.toList());
    Map<String, List<Match<Document>>> matches = new MatchService().applyMatchByDocId(documents);
    matches.entrySet().forEach(entry -> {
        entry.getValue().forEach(match -> {
            System.out
                .println(
                    "Data: " + match.getData() + " Matched With: " + match.getMatchedWith() + " Score: "
                        + match.getScore().getResult());
        });
    });
}
// prints the FuzzyWuzzy similarity ratio for each consecutive pair of rows
// in the sample input table above
@Test
public void FuzzyWuzzyTest() {
applyFuzzyWuzzy(input);
}
/**
 * Prints the FuzzyWuzzy similarity ratio for pairs of author names known to be
 * wrongly associated, read one JSON object per line from the
 * wrongassociation.json test resource.
 */
@Test
public void FuzzyWuzzyWrongTest() throws IOException {
    final String inputPath = getClass()
        .getResource("/eu/dnetlib/dhp/oa/graph/clean/wrongassociation.json")
        .getPath();
    List<OrcidAuthor> orcidAuthorList = new ArrayList<>();
    // try-with-resources: the original never closed the reader (resource leak)
    try (BufferedReader reader = new BufferedReader(new FileReader(inputPath))) {
        String line;
        while (null != (line = reader.readLine())) {
            orcidAuthorList.add(new Gson().fromJson(line, OrcidAuthor.class));
        }
    }
    applyFuzzyWuzzy(orcidAuthorList);
}
// prints the FuzzyWuzzy similarity ratio of every orcid/result pair in the list
private void applyFuzzyWuzzy(List<OrcidAuthor> orcidAuthorList) {
    for (OrcidAuthor entry : orcidAuthorList) {
        final String orcid = MakeReportSparkJob.handleNameSurname(entry.getOrcid());
        final String result = MakeReportSparkJob.handleNameSurname(entry.getResult());
        System.out
            .println(
                "FuzzyWuzzy of '" + orcid + "' & '" + result + "' | Similarity ratio "
                    + FuzzySearch.ratio(orcid, result));
    }
}
// prints the FuzzyWuzzy similarity ratio of each consecutive pair of rows
private void applyFuzzyWuzzy(String[][] pairs) {
    for (int idx = 0; idx < pairs.length; idx += 2) {
        final String first = pairs[idx][1];
        final String second = pairs[idx + 1][1];
        System.out
            .println(
                "FuzzyWuzzy of '" + first + "' & '" + second + "' | Similarity ratio "
                    + FuzzySearch.ratio(first, second));
    }
}
// lightweight bean mirroring one JSON line of the wrongassociation.json
// resource: the author string provided by ORCID and the one found in the result
class OrcidAuthor implements Serializable {
// author name as provided by ORCID
private String orcid;
// author name as found in the result
private String result;
public String getOrcid() {
return orcid;
}
public void setOrcid(String orcid) {
this.orcid = orcid;
}
public String getResult() {
return result;
}
public void setResult(String result) {
this.result = result;
}
}
} }

View File

@ -0,0 +1,35 @@
{"orcid":"Alex Bullock" ,"result": "Gillian Farnie"}
{"orcid": "Luís Rocha", "result":"Pedro Relvas"}
{"orcid": "Prasanth Manohar", "result": "Nachimuthu Ramesh"}
{"orcid": "Zhiying Lin", "result":"Guanglong Huang"}
{"orcid":"Andrew Golnar","result":"Kim Pepin"}
{"orcid": "Gilles Marcou", "result":"Filippo Lunghini"}
{"orcid": "Philip Hahn", "result":"John Maron"}
{"orcid": "Kirsty Gibson", "result":"Kim R. Hardie"}
{"orcid": "Paula Lago", "result":"Shingo Takeda"}
{"orcid": "Paul Seidler", "result":"Dalziel J. Wilson"}
{"orcid": "Solomon Okunade", "result":"Rufus Adebayo Ajisafe"}
{"orcid": "Emi Arai", "result":"Masaru Hasegawa"}
{"orcid": "Dr Muhammad Yameen Sandhu", "result":"Nutapong Somjit"}
{"orcid": "Xianlei Cai", "result":"Weiming Yu"}
{"orcid": "Bing He", "result":"Chuan Xing"}
{"orcid": "JULIEN COURCHET", "result":"Franck Polleux"}
{"orcid": "Xiaoyun Pan", "result":"Liru Chen"}
{"orcid": "Marianne Okal", "result":"Brendan Hodge"}
{"orcid": "Michal Fereczkowski", "result":"Silje Grini Nielsen"}
{"orcid": "Nobuyuki Nakai", "result":"Tadafumi Kurogi"}
{"orcid": "Colin Daniel", "result":"Christine Cuyler"}
{"orcid": "Xavier Arnan", "result":"Anna Torné-Noguera"}
{"orcid": "Denita Hadziabdic", "result":"Meher Ony"}
{"orcid": "Kor de Jong", "result":"K. Koning"}
{"orcid": "Chaya Patel", "result":"David Leib"}
{"orcid": "Fagner Carniel", "result":"Adonai Lacruz"}
{"orcid": "Carrie Peltz", "result":"Erica Kornblith"}
{"orcid": "Kathryn Huyvaert", "result":"Larissa L. Bailey"}
{"orcid": "Christine Provost", "result":"Nathalie Sennéchael"}
{"orcid": "Nancy Pachana", "result":"Lisa DiNatale"}
{"orcid": "ARDESHIR BAYAT", "result":"P. Marcos Gorresen"}
{"orcid": "Paul Berkowitz", "result":"Silje Grini Nielsen"}
{"orcid": "Alice Laciny", "result":"Brian Metscher"}
{"orcid": "Octavio Rojas", "result":"Josie A. Griffin"}
{"orcid": "Carlo Sandroni", "result":"Riccardo Scattolini"}