added comment

This commit is contained in:
Sandro La Bruzzo 2023-11-28 09:00:48 +01:00
parent 6f4d0c05ea
commit 59111713fa
3 changed files with 21 additions and 84 deletions

View File

@ -1,17 +1,11 @@
package eu.dnetlib.dhp.oa.merge; package eu.dnetlib.dhp.oa.merge;
import java.io.FileWriter;
import java.io.IOException;
import java.text.Normalizer; import java.text.Normalizer;
import java.util.*; import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.MutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.NotNull;
import com.wcohen.ss.JaroWinkler; import com.wcohen.ss.JaroWinkler;
@ -161,42 +155,8 @@ public class AuthorMerger {
.replaceAll("(\\n)+", " ") .replaceAll("(\\n)+", " ")
.trim(); .trim();
// return Arrays.stream(fullname.split("[\\s | , | ;]+")).map(String::toLowerCase).sorted().collect(Collectors.joining());
} }
//
// public static List<Author> enrichOrcid2(List<Author> baseAuthor, List<Author> orcidAuthor) {
// if (baseAuthor == null || baseAuthor.isEmpty())
// return orcidAuthor;
//
// if (orcidAuthor == null || orcidAuthor.isEmpty())
// return baseAuthor;
//
// if (baseAuthor.size() == 1 && orcidAuthor.size() > 10)
// return baseAuthor;
//
//
// Map<String, List<Author>> pubClusters = baseAuthor.stream().collect(Collectors.toMap(AuthorMerger::generateAuthorkey, Arrays::asList, (a, b) -> {
// a.addAll(b);
// return a;
// }));
//
// Map<String, List<Author>> orcidClusters = baseAuthor.stream().collect(Collectors.toMap(AuthorMerger::generateAuthorkey, Arrays::asList, (a, b) -> {
// a.addAll(b);
// return a;
// }));
//
// System.out.println(pubClusters.keySet().size());
// System.out.println(orcidClusters.keySet().size());
//
//
//
//
// return null;
//
//
// }
static int hammingDist(String str1, String str2) { static int hammingDist(String str1, String str2) {
if (str1.length() != str2.length()) if (str1.length() != str2.length())
return Math.max(str1.length(), str2.length()); return Math.max(str1.length(), str2.length());
@ -220,7 +180,14 @@ public class AuthorMerger {
return null; return null;
} }
public static boolean checkSimilarity2(final Author left, final Author right) { /**
* This method tries to figure out whether two authors are the same in the context
* of ORCID enrichment
* @param left Author in the OAF entity
* @param right Author ORCID
* @return true if, based on a heuristic on the names of the authors, the two authors are considered the same.
*/
public static boolean checkORCIDSimilarity(final Author left, final Author right) {
final Person pl = parse(left); final Person pl = parse(left);
final Person pr = parse(right); final Person pr = parse(right);
@ -267,8 +234,16 @@ public class AuthorMerger {
else else
return false; return false;
} }
//
public static List<Author> enrichOrcid2(List<Author> baseAuthor, List<Author> orcidAuthor) {
/**
* Method to enrich ORCID information in one list of authors based on another list
* @param baseAuthor the Author List in the OAF Entity
* @param orcidAuthor The list of ORCID Author intersected
* @return The Author List of the OAF Entity, enriched with the PIDs of the matching ORCID authors
*/
public static List<Author> enrichOrcid(List<Author> baseAuthor, List<Author> orcidAuthor) {
if (baseAuthor == null || baseAuthor.isEmpty()) if (baseAuthor == null || baseAuthor.isEmpty())
return orcidAuthor; return orcidAuthor;
@ -283,7 +258,7 @@ public class AuthorMerger {
oAuthor.addAll(orcidAuthor); oAuthor.addAll(orcidAuthor);
baseAuthor.forEach(ba -> { baseAuthor.forEach(ba -> {
Optional<Author> aMatch = oAuthor.stream().filter(oa -> checkSimilarity2(ba, oa)).findFirst(); Optional<Author> aMatch = oAuthor.stream().filter(oa -> checkORCIDSimilarity(ba, oa)).findFirst();
if (aMatch.isPresent()) { if (aMatch.isPresent()) {
final Author sameAuthor = aMatch.get(); final Author sameAuthor = aMatch.get();
addPid(ba, sameAuthor.getPid()); addPid(ba, sameAuthor.getPid());
@ -293,40 +268,6 @@ public class AuthorMerger {
return baseAuthor; return baseAuthor;
} }
/**
 * Enriches the authors of an entity with the PIDs of ORCID authors, using a
 * similarity matrix: each ORCID author is paired with the base author that
 * scores highest against it.
 *
 * @param baseAuthor the authors to enrich; returned as-is when {@code orcidAuthor} is null or empty
 * @param orcidAuthor the ORCID authors whose PIDs may be added; returned as-is when
 *                    {@code baseAuthor} is null or empty
 * @return {@code baseAuthor} (the same list instance, its elements possibly updated through
 *         {@code addPid}), or {@code orcidAuthor} when {@code baseAuthor} is null or empty
 */
public static List<Author> enrichOrcid(List<Author> baseAuthor, List<Author> orcidAuthor) {
// Nothing to enrich: hand back the ORCID list unchanged.
if (baseAuthor == null || baseAuthor.isEmpty())
return orcidAuthor;
// Nothing to enrich with: hand back the base list unchanged.
if (orcidAuthor == null || orcidAuthor.isEmpty())
return baseAuthor;
// Skip the matching when a single base author faces more than 10 ORCID authors.
// NOTE(review): presumably a guard against spurious matches - confirm the rationale.
if (baseAuthor.size() == 1 && orcidAuthor.size() > 10)
return baseAuthor;
// Rows are indexed by base author, columns by ORCID author.
final Double similarityMatrix[][] = new Double[baseAuthor.size()][orcidAuthor.size()];
// One tracker per ORCID author (column), updated below with the best-scoring cell seen so far.
final List<SimilarityCellInfo> maxColums = new ArrayList<>();
for (int i = 0; i < orcidAuthor.size(); i++)
maxColums.add(new SimilarityCellInfo());
for (int i = 0; i < baseAuthor.size(); i++) {
for (int j = 0; j < orcidAuthor.size(); j++) {
similarityMatrix[i][j] = sim(baseAuthor.get(i), orcidAuthor.get(j));
// Strict '<': on equal scores the earlier base author (lower i) keeps the column.
if (maxColums.get(j).maxColumnSimilarity < similarityMatrix[i][j])
maxColums.get(j).setValues(i, j, similarityMatrix[i][j]);
}
}
// Only columns whose best score is above 0.85 contribute PIDs.
// The processing order comes from SimilarityCellInfo's natural ordering, which is defined
// outside this method.
maxColums
.stream()
.sorted()
.filter(si -> si.maxColumnSimilarity > 0.85)
.forEach(si -> addPid(baseAuthor.get(si.authorPosition), orcidAuthor.get(si.orcidPosition).getPid()));
return baseAuthor;
}
private static void addPid(final Author a, final List<StructuredProperty> pids) { private static void addPid(final Author a, final List<StructuredProperty> pids) {
if (a.getPid() == null) { if (a.getPid() == null) {

View File

@ -4,13 +4,9 @@ package eu.dnetlib.oa.merge;
import static org.junit.jupiter.api.Assertions.*; import static org.junit.jupiter.api.Assertions.*;
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.Objects; import java.util.Objects;
import java.util.stream.Collectors;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import org.junit.platform.commons.util.StringUtils; import org.junit.platform.commons.util.StringUtils;
@ -67,7 +63,7 @@ public class AuthorMergerTest {
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();
// final List<Author> enrichedList = AuthorMerger.enrichOrcid(publicationAuthors, orcidAuthors); // final List<Author> enrichedList = AuthorMerger.enrichOrcid(publicationAuthors, orcidAuthors);
final List<Author> enrichedList = AuthorMerger.enrichOrcid2(publicationAuthors, orcidAuthors); final List<Author> enrichedList = AuthorMerger.enrichOrcid(publicationAuthors, orcidAuthors);
long enrichedAuthorWithPid = enrichedList long enrichedAuthorWithPid = enrichedList
.stream() .stream()
@ -105,7 +101,7 @@ public class AuthorMergerTest {
right.setSurname("Anand"); right.setSurname("Anand");
right.setFullname("Rachna, Anand"); right.setFullname("Rachna, Anand");
// System.out.println(AuthorMerger.normalize(right.getFullname())); // System.out.println(AuthorMerger.normalize(right.getFullname()));
boolean same = AuthorMerger.checkSimilarity2(left, right); boolean same = AuthorMerger.checkORCIDSimilarity(left, right);
assertTrue(same); assertTrue(same);

View File

@ -89,7 +89,7 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
p p
} }
case (p: Publication, r: Row) => case (p: Publication, r: Row) =>
p.setAuthor(AuthorMerger.enrichOrcid2(p.getAuthor, AuthorEnricher.toOAFAuthor(r))) p.setAuthor(AuthorMerger.enrichOrcid(p.getAuthor, AuthorEnricher.toOAFAuthor(r)))
p p
} }
.write .write