forked from D-Net/dnet-hadoop
This commit is contained in:
parent 0f1a4f6637
commit c29d142087
@@ -47,6 +47,12 @@
			<groupId>org.apache.commons</groupId>
			<artifactId>commons-compress</artifactId>
		</dependency>
		<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-text -->
		<dependency>
			<groupId>org.apache.commons</groupId>
			<artifactId>commons-text</artifactId>
			<version>1.9</version>
		</dependency>

		<dependency>
			<groupId>commons-io</groupId>
@@ -123,6 +129,21 @@
			<version>2.4.0.cloudera2</version>
		</dependency>

		<!-- https://mvnrepository.com/artifact/me.xdrop/fuzzywuzzy -->
		<dependency>
			<groupId>me.xdrop</groupId>
			<artifactId>fuzzywuzzy</artifactId>
			<version>1.3.1</version>
		</dependency>

		<!-- https://mvnrepository.com/artifact/com.intuit.fuzzymatcher/fuzzy-matcher -->
		<dependency>
			<groupId>com.intuit.fuzzymatcher</groupId>
			<artifactId>fuzzy-matcher</artifactId>
			<version>1.0.4</version>
		</dependency>

	</dependencies>
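The hunk above adds the two fuzzy-matching libraries exercised later in this commit. For orientation, a minimal sketch of their call patterns as they appear below (the inputs are illustrative only):

	// me.xdrop.fuzzywuzzy: token-based similarity ratio in the 0-100 range
	int ratio = FuzzySearch.ratio("prasanth manohar", "nachimuthu ramesh");

	// com.intuit.fuzzymatcher: documents are built with Document.Builder/Element.Builder
	// and then matched in bulk with MatchService.applyMatchByDocId(documentList)
	MatchService matchService = new MatchService();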
@@ -5,9 +5,12 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;

import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.CosineDistance;
import org.apache.commons.text.similarity.LevenshteinDistance;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
@@ -23,13 +26,49 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.intuit.fuzzymatcher.component.MatchService;
import com.intuit.fuzzymatcher.domain.Document;
import com.intuit.fuzzymatcher.domain.Element;
import com.intuit.fuzzymatcher.domain.ElementType;
import com.intuit.fuzzymatcher.domain.Match;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.common.PacePerson;
import eu.dnetlib.dhp.schema.oaf.Result;
import me.xdrop.fuzzywuzzy.FuzzySearch;
import scala.Tuple2;

/**
 * It checks whether the author information provided by ORCID and the author information found in the result match.
 * The author information is normalised before the check. Normalisation steps: words are lower-cased and trimmed,
 * and accents are replaced with their non-accented equivalents. Only alphabetical characters and white space are
 * retained; every other character is replaced with a space.
 *
 * The check is made at different levels:
 *
 * Level 1: the ORCID author surname and the result author surname are identical. We consider the match to be right.
 *
 * Level 2: we verify if the ORCID author surname contains the result author surname, or vice versa. If so, we
 * consider the match to be right.
 *
 * Level 3: we verify if one of the two surnames is composed of two words. In that case we concatenate the words and
 * do the check again. If the two match, we consider the match to be checked.
 *
 * Level 4: name and surname can be inverted in one of the two entities. We consider the set of words longer than 2
 * composing the name and the surname, for both ORCID and the result. If all the words of the shorter list are
 * contained in the longer one, we consider the match to be checked.
 *
 * Level 5: name and surname are inverted, but one of the two is composed of two words. A mix of Level 3 and Level 4.
 * We consider the match to be checked.
 *
 * Level 6: the surnames differ by a few characters. We apply the Levenshtein distance to the surnames if their
 * length is bigger than 3. If the distance is less than 2, we consider the match to be checked.
 *
 * In all the other cases the match is considered wrong.
 */

public class MakeReportSparkJob implements Serializable {
	private static final Logger log = LoggerFactory.getLogger(MakeReportSparkJob.class);
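To make the level semantics concrete, a usage sketch of the matching entry point added later in this diff (my illustration, not part of the commit; it assumes the ResultInfo and OrcidAuthotitative setters shown elsewhere in the diff):

	ResultInfo ri = new ResultInfo();
	ri.setName("ting you");
	ri.setSurname("wang");

	OrcidAuthotitative oa = new OrcidAuthotitative();
	oa.setName("wang");
	oa.setSurname("ting you");

	// name and surname are inverted between the two records: every word longer
	// than two characters of one word list is contained in the other
	Tuple2<String, ReportInfo> verdict = MakeReportSparkJob.getStringReportInfoTuple2(new Tuple2<>(ri, oa));
	// expected: verdict._1() is "check" and verdict._2().getLevel() is "level4"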
@@ -105,13 +144,25 @@ public class MakeReportSparkJob implements Serializable {

	private static void addInList(List<String> list, String to_add) {
		for (String word : to_add.split(" ")) {
-			if (word.length() >= 2) {
+			if (word.length() > 2) {
				list.add(word);
			}
		}
	}

	public static String handleNameSurname(String input) {
		input = input.toLowerCase().replace(".", "");
		if (input.startsWith("dr")) {
			input = input.substring(3);
		}

		return StringUtils
			.stripAccents(input.trim())
			.replaceAll("[^a-z\\s]+", " ")
			.trim();
	}

	private static <I extends Result> void makeReport(SparkSession spark, String inputPath, Class<I> entityClazz,
		String outputPath, String preparedInfoPath,
		Dataset<OrcidAuthotitative> authoritative) {
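For reference, a quick sketch (mine, not part of the commit) of what handleNameSurname produces, traced from the implementation above:

	handleNameSurname("Dr. Ulrike Elsdörfer Ph.D."); // -> "ulrike elsdorfer phd"
	handleNameSurname("Zhang Tian-Tuo");             // -> "zhang tian tuo"
	handleNameSurname("O'Donnell");                  // -> "o donnell"
	// note: the "dr" prefix check also clips names that genuinely start with "dr",
	// e.g. "Drake" -> "ke"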
@@ -125,84 +176,7 @@
					.equalTo(resultInfo.col("orcid")),
				"left")
			.map((MapFunction<Tuple2<ResultInfo, OrcidAuthotitative>, Tuple2<String, ReportInfo>>) pair -> {
-				Optional<OrcidAuthotitative> ooa = Optional.ofNullable(pair._2());
-				if (!ooa.isPresent()) {
-					return null;
-				}
-				OrcidAuthotitative oa = ooa.get();
-
-				ResultInfo ri = pair._1();
-
-				if (StringUtils.isBlank(ri.getSurname())) {
-					PacePerson pp = new PacePerson(ri.getFullname(), false);
-					ri.setSurname(pp.getNormalisedSurname());
-					ri.setName(pp.getNormalisedFirstName());
-				}
-				ReportInfo reportInfo = new ReportInfo();
-				reportInfo.setOid(oa.getOid());
-				reportInfo.setOname(oa.getName());
-				reportInfo.setOsurname(oa.getSurname());
-				reportInfo.setOcreditname(oa.getCreditname());
-				reportInfo.setAssociatedAuthors(Arrays.asList(ri));
-
-				if (!Optional.ofNullable(oa.getSurname()).isPresent()) {
-					return new Tuple2<>("missing", reportInfo);
-				}
-				final String handledOsurname = StringUtils
-					.stripAccents(oa.getSurname().toLowerCase().trim())
-					.replace("-", " ")
-					.replace(".", "");
-				final String handledSurname = StringUtils
-					.stripAccents(ri.getSurname().toLowerCase().trim())
-					.replace("-", " ")
-					.replace(".", "");
-				if (!handledOsurname
-					.equalsIgnoreCase(handledSurname)) {
-					if (!handledOsurname.contains(handledSurname) && !handledSurname.contains(handledOsurname)) {
-						// check if the words composing the name and the surname are the same or one list contains the
-						// other.
-						// do for words of lenght bigger than two
-						String handledOname = "";
-						if (Optional.ofNullable(oa.getName()).isPresent()) {
-							handledOname = StringUtils
-								.stripAccents(oa.getName().toLowerCase().trim())
-								.replace("-", " ")
-								.replace(".", "");
-						}
-						String handledName = "";
-						if (Optional.ofNullable(ri.getName()).isPresent()) {
-							handledName = StringUtils
-								.stripAccents(ri.getName().toLowerCase().trim())
-								.replace("-", " ")
-								.replace(".", "");
-						}
-
-						final List<String> orcidList = new ArrayList<>();
-						final List<String> paperList = new ArrayList<>();
-
-						addInList(orcidList, handledOname);
-						addInList(orcidList, handledOsurname);
-
-						addInList(paperList, handledSurname);
-						addInList(paperList, handledName);
-
-						if (orcidList.size() <= paperList.size()) {
-							if (searchIn(paperList, orcidList)) {
-								return new Tuple2<>("check", reportInfo);
-							}
-						} else {
-							if (searchIn(orcidList, paperList)) {
-								return new Tuple2<>("check", reportInfo);
-							}
-						}
-
-						// todo add another level of checking (use levenstein)
-						return new Tuple2<>("wrong", reportInfo);
-					}
-					return new Tuple2<>("right", reportInfo);
-				}
-
-				return new Tuple2<>("right", reportInfo);
+				return getStringReportInfoFuzzyTuple2(pair);

			}, Encoders.tuple(Encoders.STRING(), Encoders.bean(ReportInfo.class)))
			.filter(Objects::nonNull);
@@ -222,6 +196,280 @@ public class MakeReportSparkJob implements Serializable {

	}

	private static double fuzzyMatch(String orcid, String result) {
		// apply one or more fuzzy functions to determine if the input strings match
		// pairs that score 1.0 with fuzzy-matcher => right
		// pairs that score above 0.66 with fuzzy-matcher are also put among the right ones
		// pairs not matched above, but with a fuzzywuzzy ratio above 0.5, are put in check
		// (probably right)
		// pairs with a fuzzywuzzy ratio between 0.5 and 0.3 ...
		return 0;

	}
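The stub above only lists its intended thresholds. A hypothetical sketch (mine, not part of the commit) of how those cut-offs could be wired together, assuming fuzzy-matcher scores in [0, 1] and FuzzySearch.ratio values in [0, 100]:

	private static String classifyByFuzzyScores(double fuzzyMatcherScore, int fuzzyWuzzyRatio) {
		if (fuzzyMatcherScore >= 0.66) {
			return "right"; // 1.0, and anything above 0.66, counted as right
		}
		if (fuzzyWuzzyRatio > 50) {
			return "check"; // probably right
		}
		// the stub leaves open what to do with ratios between 50 and 30
		return "wrong";
	}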
	public static Tuple2<String, ReportInfo> getStringReportInfoFuzzyTuple2(
		Tuple2<ResultInfo, OrcidAuthotitative> pair) {
		Optional<OrcidAuthotitative> ooa = Optional.ofNullable(pair._2());
		if (!ooa.isPresent()) {
			return null;
		}
		OrcidAuthotitative oa = ooa.get();

		ResultInfo ri = pair._1();

		if (StringUtils.isBlank(ri.getSurname())) {
			PacePerson pp = new PacePerson(ri.getFullname(), false);
			ri.setSurname(pp.getNormalisedSurname());
			ri.setName(pp.getNormalisedFirstName());
		}
		ReportInfo reportInfo = new ReportInfo();
		reportInfo.setOid(oa.getOid());
		reportInfo.setOname(oa.getName());
		reportInfo.setOsurname(oa.getSurname());
		reportInfo.setOcreditname(oa.getCreditName());
		reportInfo.setAssociatedAuthors(Arrays.asList(ri));

		int level = 1;

		if (!Optional.ofNullable(oa.getSurname()).isPresent()) {
			return new Tuple2<>("missing", reportInfo);
		}
		final String handledOsurname = handleNameSurname(oa.getSurname());

		if (handledOsurname.equalsIgnoreCase("")) {
			return new Tuple2<>("missing", reportInfo);
		}
		final String handledSurname = handleNameSurname(ri.getSurname());

		if (handledSurname.equals("")) {
			return new Tuple2<>("missing", reportInfo);
		}

		String handledOname = "";
		if (Optional.ofNullable(oa.getName()).isPresent()) {
			handledOname = handleNameSurname(oa.getName());
		}
		String handledName = "";
		if (Optional.ofNullable(ri.getName()).isPresent()) {
			handledName = handleNameSurname(ri.getName());
		}

		String[][] input = {
			{
				"1", handledOsurname + " " + handledOname
			},
			{
				"2", handledSurname + " " + handledName
			}
		};
		// check if the two strings share at least one common word. If they do not, they are obviously wrong
		if (Math.round((1 - new CosineDistance().apply(input[0][1], input[1][1])) * 100) == 0) {
			MatchService matchService = new MatchService();

			List<Document> documentList = Arrays.asList(input).stream().map(contact -> {
				return new Document.Builder(contact[0])
					.addElement(
						new Element.Builder<String>()
							.setValue(contact[1])
							.setType(ElementType.NAME)
							.createElement())
					.createDocument();
			}).collect(Collectors.toList());
			if (matchService.applyMatchByDocId(documentList).entrySet().size() == 0) {
				if (FuzzySearch.ratio(input[0][1], input[1][1]) < 30) {
					return new Tuple2<>("wrong", reportInfo);
				}

			}

		}

//		// they have some words in common. check if orcid provides creditName or otherNames to check for distance
//		//
//		List<Document> documentList = Arrays.asList(input).stream().map(contact -> {
//			return new Document.Builder(contact[0])
//				.addElement(
//					new Element.Builder<String>()
//						.setValue(contact[1])
//						.setType(ElementType.NAME)
//						.createElement())
//				.createDocument();
//		}).collect(Collectors.toList());
//
//		MatchService matchService = new MatchService();
//
//		Map<String, List<Match<Document>>> result = matchService.applyMatchByDocId(documentList);
//
//		if (result.entrySet().size() > 0) {
//			reportInfo.setLevel("fuzzyMatch");
//			return new Tuple2<>("right", reportInfo);
//		}

		return new Tuple2<>("check", reportInfo);
	}
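An aside on the guard in the method above (my note, not part of the commit): commons-text's CosineDistance tokenises the strings into words, so rounding (1 - distance) * 100 to 0 effectively means the two normalised full names share no word at all. For instance:

	// no common token -> distance 1.0 -> the guard fires and the fuzzy checks run
	double d1 = new CosineDistance().apply("alex bullock", "gillian farnie");
	// one shared token ("zhang") -> distance < 1.0 -> the pair falls through to "check"
	double d2 = new CosineDistance().apply("zhang tian tuo", "zhang tiantuo");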
	public static Tuple2<String, ReportInfo> getStringReportInfoTuple2(Tuple2<ResultInfo, OrcidAuthotitative> pair) {
		Optional<OrcidAuthotitative> ooa = Optional.ofNullable(pair._2());
		if (!ooa.isPresent()) {
			return null;
		}
		OrcidAuthotitative oa = ooa.get();

		ResultInfo ri = pair._1();

		if (StringUtils.isBlank(ri.getSurname())) {
			PacePerson pp = new PacePerson(ri.getFullname(), false);
			ri.setSurname(pp.getNormalisedSurname());
			ri.setName(pp.getNormalisedFirstName());
		}
		ReportInfo reportInfo = new ReportInfo();
		reportInfo.setOid(oa.getOid());
		reportInfo.setOname(oa.getName());
		reportInfo.setOsurname(oa.getSurname());
		reportInfo.setOcreditname(oa.getCreditName());
		reportInfo.setAssociatedAuthors(Arrays.asList(ri));

		int level = 1;

		if (!Optional.ofNullable(oa.getSurname()).isPresent()) {
			return new Tuple2<>("missing", reportInfo);
		}
		final String handledOsurname = handleNameSurname(oa.getSurname());

		if (handledOsurname.equalsIgnoreCase("")) {
			return new Tuple2<>("missing", reportInfo);
		}
		final String handledSurname = handleNameSurname(ri.getSurname());

		if (handledSurname.equals("")) {
			return new Tuple2<>("missing", reportInfo);
		}

		// level 1: check if oSurname and surname are equal
		if (handledOsurname.equals(handledSurname)) {
			reportInfo.setLevel("level" + level);
			return new Tuple2<>("right", reportInfo);
		}
		level++;

		// level 2: check if one is contained in the other
		if (handledOsurname.contains(handledSurname) || handledSurname.contains(handledOsurname)) {
			reportInfo.setLevel("level" + level);
			return new Tuple2<>("right", reportInfo);
		}
		level++;
		// level 3: check if one of the two is composed of more than one word. In this case concatenate the two words
		// and check again (Mohammadi Peyhani vs Mohammadipeyhani)
		String[] handledorcidSplit = handledOsurname.split(" ");
		String[] handledresultSplit = handledSurname.split(" ");
		if (handledorcidSplit.length == 2) {
			String tmpSurname = handledorcidSplit[0] + handledorcidSplit[1];
			if (tmpSurname.equals(handledSurname)) {
				reportInfo.setLevel("level" + level);
				return new Tuple2<>("check", reportInfo);
			}
		}
		if (handledresultSplit.length == 2) {
			String tmpSurname = handledresultSplit[0] + handledresultSplit[1];
			// compare against the ORCID surname (a comparison against handledSurname itself could never match)
			if (tmpSurname.equals(handledOsurname)) {
				reportInfo.setLevel("level" + level);
				return new Tuple2<>("check", reportInfo);
			}
		}
		level++;
		// level 4: check if the words composing the name and the surname are the same or one list contains the
		// other. Do it for words of length bigger than two
		String handledOname = "";
		if (Optional.ofNullable(oa.getName()).isPresent()) {
			handledOname = handleNameSurname(oa.getName());
		}
		String handledName = "";
		if (Optional.ofNullable(ri.getName()).isPresent()) {
			handledName = handleNameSurname(ri.getName());
		}

		final List<String> orcidList = new ArrayList<>();
		final List<String> paperList = new ArrayList<>();

		addInList(orcidList, handledOname);
		addInList(orcidList, handledOsurname);

		addInList(paperList, handledSurname);
		addInList(paperList, handledName);

		if (checkListContainment(reportInfo, level, orcidList, paperList))
			return new Tuple2<>("check", reportInfo);
		level++;

		// level 5: mix of level 3 and level 4, with name and surname possibly inverted
		handledorcidSplit = handledOsurname.split(" ");
		handledresultSplit = handledName.split(" ");

		if (handledorcidSplit.length == 2) {
			orcidList.clear();
			orcidList.add(handledorcidSplit[0] + handledorcidSplit[1]);
			addInList(orcidList, handledOname);
			if (checkListContainment(reportInfo, level, orcidList, paperList)) {
				return new Tuple2<>("check", reportInfo);
			}
			orcidList.clear();
			orcidList.add(handledorcidSplit[1] + handledorcidSplit[0]);
			addInList(orcidList, handledOname);
			if (checkListContainment(reportInfo, level, orcidList, paperList)) {
				return new Tuple2<>("check", reportInfo);
			}
		}
		if (handledresultSplit.length == 2) {
			orcidList.clear();
			addInList(orcidList, handledOname);
			addInList(orcidList, handledOsurname);
			paperList.clear();
			paperList.add(handledresultSplit[0] + handledresultSplit[1]);
			addInList(paperList, handledSurname);
			if (checkListContainment(reportInfo, level, orcidList, paperList))
				return new Tuple2<>("check", reportInfo);
			paperList.clear();
			paperList.add(handledresultSplit[1] + handledresultSplit[0]);
			addInList(paperList, handledSurname);
			if (checkListContainment(reportInfo, level, orcidList, paperList))
				return new Tuple2<>("check", reportInfo);
		}
		level++;

		// level 6: apply the Levenshtein distance to surnames longer than 3 characters
		if (handledOsurname.length() > 3 && handledSurname.length() > 3) {
			LevenshteinDistance l = new LevenshteinDistance();
			if (l.apply(handledOsurname, handledSurname) <= 2) {
				reportInfo.setLevel("level" + level);
				return new Tuple2<>("check", reportInfo);
			}
		}
		if (handledOsurname.length() > 3 && handledName.length() > 3) {
			LevenshteinDistance l = new LevenshteinDistance();
			if (l.apply(handledOsurname, handledName) <= 2) {
				reportInfo.setLevel("level" + level);
				return new Tuple2<>("check", reportInfo);
			}
		}

		return new Tuple2<>("wrong", reportInfo);
	}

	private static boolean checkListContainment(ReportInfo reportInfo, int level, List<String> orcidList,
		List<String> paperList) {
		if (orcidList.size() <= paperList.size()) {
			if (searchIn(paperList, orcidList)) {
				reportInfo.setLevel("level" + level);
				return true;
			}
		} else {
			if (searchIn(orcidList, paperList)) {
				reportInfo.setLevel("level" + level);
				return true;
			}
		}
		return false;
	}

	/**
	 * Searches in list1 all the words of list2
	 * @param list1 the list in which to search for the words
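A small illustration (mine, not part of the commit) of the containment semantics used by checkListContainment above, based on the "Amilcar António Teiga Teixeira" / "Amílcar Teixeira" pair from the test data further down:

	List<String> orcidList = Arrays.asList("amilcar", "antonio", "teiga", "teixeira");
	List<String> paperList = Arrays.asList("amilcar", "teixeira");
	// paperList is the shorter list, so searchIn(orcidList, paperList) is evaluated:
	// every word of paperList occurs in orcidList, the level is set and "check" is reported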
@@ -2,19 +2,47 @@

package eu.dnetlib.dhp.oa.graph.clean.authorpids;

import java.io.Serializable;
import java.util.List;

public class OrcidAuthotitative implements Serializable {
	private String oid;
	private String name;
	private String surname;
-	private String creditname;
+	private String creditName;
	private String otherName;
	private List<String> otherNames;
	private String errorCode;

-	public String getCreditname() {
-		return creditname;
+	public String getOtherName() {
+		return otherName;
	}

-	public void setCreditname(String creditname) {
-		this.creditname = creditname;
+	public void setOtherName(String otherName) {
+		this.otherName = otherName;
	}

	public List<String> getOtherNames() {
		return otherNames;
	}

	public void setOtherNames(List<String> otherNames) {
		this.otherNames = otherNames;
	}

	public String getErrorCode() {
		return errorCode;
	}

	public void setErrorCode(String errorCode) {
		this.errorCode = errorCode;
	}

	public String getCreditName() {
		return creditName;
	}

	public void setCreditName(String creditName) {
		this.creditName = creditName;
	}

	public String getOid() {
@@ -73,7 +73,7 @@ public class PrepareResultsSparkJob implements Serializable {

		result.createOrReplaceTempView("result");

-		String query = "select auth.name name, auth.surname surname, auth.fullname fullname, pIde.value orcid, id, cf.value collectedfrom"
+		String query = "select auth.name name, auth.surname surname, auth.fullname fullname, pIde.value orcid, id, cf.value collectedfrom "
			+
			"from result " +
			"lateral view explode(author) a as auth " +
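The one-character change above matters because the query is assembled by string concatenation (an illustration, not part of the commit):

	String bad = "select cf.value collectedfrom" + "from result";
	// -> "select cf.value collectedfromfrom result": the from keyword is swallowed
	String good = "select cf.value collectedfrom " + "from result";
	// -> "select cf.value collectedfrom from result"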
@@ -12,6 +12,16 @@ public class ReportInfo implements Serializable {

	private List<ResultInfo> associatedAuthors;

	private String level;

	public String getLevel() {
		return level;
	}

	public void setLevel(String level) {
		this.level = level;
	}

	public String getOid() {
		return oid;
	}
@@ -404,7 +404,7 @@
				--conf spark.sql.shuffle.partitions=7680
			</spark-opts>
			<arg>--preparedInfoPath</arg><arg>${workingDir}/dataset</arg>
-			<arg>--outputPath</arg><arg>${utputPath}/dataset</arg>
+			<arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
			<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
			<arg>--inputPath</arg><arg>${inputPath}/dataset</arg>
			<arg>--orcidInputPath</arg><arg>${orcidInputPath}</arg>
@@ -182,7 +182,7 @@
	<decision name="cleanorreport">
		<switch>
			<case to="make_report">${wf:conf('clean') eq false}</case>
-			<case to="clean_orcid_copy">${wf:conf('clean') eq true}</case>
+			<case to="clean_orcid">${wf:conf('clean') eq true}</case>
			<default to="make_report"/>
		</switch>
	</decision>
@@ -1,11 +1,19 @@

package eu.dnetlib.dhp.oa.graph.clean;

-import java.io.IOException;
+import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.CosineDistance;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.yarn.webapp.hamlet.Hamlet;
import org.apache.spark.SparkConf;

@@ -21,8 +29,19 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;
import com.intuit.fuzzymatcher.component.MatchService;
import com.intuit.fuzzymatcher.domain.Document;
import com.intuit.fuzzymatcher.domain.Element;
import com.intuit.fuzzymatcher.domain.ElementType;
import com.intuit.fuzzymatcher.domain.Match;
import com.wcohen.ss.Levenstein;

import eu.dnetlib.dhp.oa.graph.clean.authorpids.*;
import jdk.nashorn.internal.ir.annotations.Ignore;
import me.xdrop.fuzzywuzzy.FuzzySearch;
import net.sf.saxon.trans.Maker;
import scala.Tuple2;

public class CleanOrcidTest {
@@ -34,6 +53,106 @@ public class CleanOrcidTest {

	private static final Logger log = LoggerFactory.getLogger(CleanOrcidTest.class);

	// needed for fuzzywuzzy to get a lower bound ratio under which the authors are most probably different
	String[][] wrong = {
		{ "1", MakeReportSparkJob.handleNameSurname("Alex Bullock") },
		{ "2", MakeReportSparkJob.handleNameSurname("Gillian Farnie") },
		{ "3", MakeReportSparkJob.handleNameSurname("Luís Rocha") },
		{ "4", MakeReportSparkJob.handleNameSurname("Pedro Relvas") },
		{ "9", MakeReportSparkJob.handleNameSurname("Prasanth Manohar") },
		{ "10", MakeReportSparkJob.handleNameSurname("Nachimuthu Ramesh") }
	};

	String[][] input = {
		{ "1", MakeReportSparkJob.handleNameSurname("Dr. Ulrike Elsdoerfer Ph.D.") },
		{ "2", MakeReportSparkJob.handleNameSurname("Ulrike Elsdörfer") },
		{ "3", MakeReportSparkJob.handleNameSurname("Steven Ossont") },
		{ "4", MakeReportSparkJob.handleNameSurname("Steven J. Johnston") },
		{ "5", MakeReportSparkJob.handleNameSurname("Joanna Molyn") },
		{ "6", MakeReportSparkJob.handleNameSurname("Joanna Molyn-Blanchfield") },
		{ "7", MakeReportSparkJob.handleNameSurname("Zhang Tian-Tuo") },
		{ "8", MakeReportSparkJob.handleNameSurname("Zhang Tiantuo") },
		{ "9", MakeReportSparkJob.handleNameSurname("Prasanth Manohar") },
		{ "10", MakeReportSparkJob.handleNameSurname("Nachimuthu Ramesh") },
		{ "9", MakeReportSparkJob.handleNameSurname("Hassan Ahmed") },
		{ "10", MakeReportSparkJob.handleNameSurname("Hassan Mohamed") },
		{ "11", MakeReportSparkJob.handleNameSurname("Jonathan ODonnell") },
		{ "12", MakeReportSparkJob.handleNameSurname("Jonathon A. O Dannell") },
		{ "11", MakeReportSparkJob.handleNameSurname("Amilcar António Teiga Teixeira") },
		{ "12", MakeReportSparkJob.handleNameSurname("Amílcar Teixeira") },
		{ "13", MakeReportSparkJob.handleNameSurname("Bruno Rossion") },
		{ "14", MakeReportSparkJob.handleNameSurname("B. Rossion") },
		{ "15", MakeReportSparkJob.handleNameSurname("TINGYOU WANG") },
		{ "16", MakeReportSparkJob.handleNameSurname("Wang Ting-You") },
		{ "17", MakeReportSparkJob.handleNameSurname("Jacob Moran-Gilad") },
		{ "18", MakeReportSparkJob.handleNameSurname("Moran-Gilad Jacon") },
		{ "19", MakeReportSparkJob.handleNameSurname("Adelle Semmler") },
		{ "20", MakeReportSparkJob.handleNameSurname("Adelle Craig") }
	};

	@BeforeAll
	public static void beforeAll() throws IOException {
		workingDir = Files.createTempDirectory(CleanOrcidTest.class.getSimpleName());
@@ -168,4 +287,151 @@ public class CleanOrcidTest {
			.map(item -> OBJECT_MAPPER.readValue(item, ResultInfo.class));
	}

	@Test
	public void cleanNameSurname() {
		String name = "Hübner";
		String surname = "Hubenr";

		name = StringUtils
			.stripAccents(name.toLowerCase().trim())
			.replaceAll("[^a-z\\s]+", " ");

		surname = StringUtils
			.stripAccents(surname.toLowerCase().trim())
			.replace(".", "")
			.replaceAll("[^a-z\\s]+", " ")
			.replace("'", " ")
			.trim();

		Levenstein l = new Levenstein();
		double score = Math.abs(l.score(name, surname));

		System.out.println(score);

	}

	@Test
	public void testMakeReport() {
		ResultInfo ri = new ResultInfo();
		ri.setName("Prasanth");
		ri.setSurname("Manohar");

		OrcidAuthotitative oa = new OrcidAuthotitative();
		oa.setName("Nachimuthu");
		oa.setSurname("Ramesh");

		Tuple2<ResultInfo, OrcidAuthotitative> t2 = new Tuple2<ResultInfo, OrcidAuthotitative>(ri, oa);
		Tuple2<String, ReportInfo> tmp = MakeReportSparkJob.getStringReportInfoFuzzyTuple2(t2);

		System.out.println(new Gson().toJson(tmp._2(), ReportInfo.class));
	}

	@Test
	public void cosineDistanceTest() {

		for (int i = 0; i < input.length; i += 2) {
			double cosineDistance = new CosineDistance().apply(input[i][1], input[i + 1][1]);
			System.out
				.println(
					"CosineDistance of '" + input[i][1] + "' & '" + input[i + 1][1] + "' | Words in strings are "
						+ Math.round(cosineDistance * 100) + "% dis-similar or "
						+ Math.round((1 - cosineDistance) * 100) + "% similar.");

		}
	}

	@Test
	public void testAuthorFuzzyMatch() {

		Function<String, String> clean = s -> MakeReportSparkJob.handleNameSurname(s);

		List<Document> documentList = Arrays.asList(input).stream().map(contact -> {
			return new Document.Builder(contact[0])
				.addElement(
					new Element.Builder<String>()
						.setValue(contact[1])
						.setType(ElementType.NAME)
						.setPreProcessingFunction(clean)
						.createElement())
				.createDocument();
		}).collect(Collectors.toList());

		MatchService matchService = new MatchService();

		Map<String, List<Match<Document>>> result = matchService.applyMatchByDocId(documentList);

		result.entrySet().forEach(entry -> {
			entry.getValue().forEach(match -> {
				System.out
					.println(
						"Data: " + match.getData() + " Matched With: " + match.getMatchedWith() + " Score: "
							+ match.getScore().getResult());
			});
		});
	}

	@Test
	public void FuzzyWuzzyTest() {
		applyFuzzyWuzzy(input);
	}

	@Test
	public void FuzzyWuzzyWrongTest() throws IOException {
		final String inputPath = getClass()
			.getResource("/eu/dnetlib/dhp/oa/graph/clean/wrongassociation.json")
			.getPath();

		BufferedReader reader = new BufferedReader(new FileReader(inputPath));
		String line;
		List<OrcidAuthor> orcidAuthorList = new ArrayList<>();
		while (null != (line = reader.readLine())) {
			orcidAuthorList.add(new Gson().fromJson(line, OrcidAuthor.class));
		}

		applyFuzzyWuzzy(orcidAuthorList);
	}

	private void applyFuzzyWuzzy(List<OrcidAuthor> orcidAuthorList) {
		orcidAuthorList.forEach(entry -> {
			String orcid = MakeReportSparkJob.handleNameSurname(entry.getOrcid());
			String result = MakeReportSparkJob.handleNameSurname(entry.getResult());
			System.out
				.println(
					"FuzzyWuzzy of '" + orcid + "' & '" + result + "' | Similarity ratio "
						+ FuzzySearch.ratio(orcid, result));
		});
	}

	private void applyFuzzyWuzzy(String[][] input) {
		for (int i = 0; i < input.length; i += 2) {
			System.out
				.println(
					"FuzzyWuzzy of '" + input[i][1] + "' & '" + input[i + 1][1] + "' | Similarity ratio "
						+ FuzzySearch.ratio(input[i][1], input[i + 1][1]));

		}
	}

	class OrcidAuthor implements Serializable {

		private String orcid;
		private String result;

		public String getOrcid() {
			return orcid;
		}

		public void setOrcid(String orcid) {
			this.orcid = orcid;
		}

		public String getResult() {
			return result;
		}

		public void setResult(String result) {
			this.result = result;
		}
	}

}
@@ -0,0 +1,35 @@
{"orcid": "Alex Bullock", "result": "Gillian Farnie"}
{"orcid": "Luís Rocha", "result": "Pedro Relvas"}
{"orcid": "Prasanth Manohar", "result": "Nachimuthu Ramesh"}
{"orcid": "Zhiying Lin", "result": "Guanglong Huang"}
{"orcid": "Andrew Golnar", "result": "Kim Pepin"}
{"orcid": "Gilles Marcou", "result": "Filippo Lunghini"}
{"orcid": "Philip Hahn", "result": "John Maron"}
{"orcid": "Kirsty Gibson", "result": "Kim R. Hardie"}
{"orcid": "Paula Lago", "result": "Shingo Takeda"}
{"orcid": "Paul Seidler", "result": "Dalziel J. Wilson"}
{"orcid": "Solomon Okunade", "result": "Rufus Adebayo Ajisafe"}
{"orcid": "Emi Arai", "result": "Masaru Hasegawa"}
{"orcid": "Dr Muhammad Yameen Sandhu", "result": "Nutapong Somjit"}
{"orcid": "Xianlei Cai", "result": "Weiming Yu"}
{"orcid": "Bing He", "result": "Chuan Xing"}
{"orcid": "JULIEN COURCHET", "result": "Franck Polleux"}
{"orcid": "Xiaoyun Pan", "result": "Liru Chen"}
{"orcid": "Marianne Okal", "result": "Brendan Hodge"}
{"orcid": "Michal Fereczkowski", "result": "Silje Grini Nielsen"}
{"orcid": "Nobuyuki Nakai", "result": "Tadafumi Kurogi"}
{"orcid": "Colin Daniel", "result": "Christine Cuyler"}
{"orcid": "Xavier Arnan", "result": "Anna Torné-Noguera"}
{"orcid": "Denita Hadziabdic", "result": "Meher Ony"}
{"orcid": "Kor de Jong", "result": "K. Koning"}
{"orcid": "Chaya Patel", "result": "David Leib"}
{"orcid": "Fagner Carniel", "result": "Adonai Lacruz"}
{"orcid": "Carrie Peltz", "result": "Erica Kornblith"}
{"orcid": "Kathryn Huyvaert", "result": "Larissa L. Bailey"}
{"orcid": "Christine Provost", "result": "Nathalie Sennéchael"}
{"orcid": "Nancy Pachana", "result": "Lisa DiNatale"}
{"orcid": "ARDESHIR BAYAT", "result": "P. Marcos Gorresen"}
{"orcid": "Paul Berkowitz", "result": "Silje Grini Nielsen"}
{"orcid": "Alice Laciny", "result": "Brian Metscher"}
{"orcid": "Octavio Rojas", "result": "Josie A. Griffin"}
{"orcid": "Carlo Sandroni", "result": "Riccardo Scattolini"}