forked from D-Net/dnet-hadoop
added comment
This commit is contained in:
parent
6f4d0c05ea
commit
59111713fa
|
@ -1,17 +1,11 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.merge;
|
||||
|
||||
import java.io.FileWriter;
|
||||
import java.io.IOException;
|
||||
import java.text.Normalizer;
|
||||
import java.util.*;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.lang3.tuple.MutablePair;
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import com.wcohen.ss.JaroWinkler;
|
||||
|
@ -161,42 +155,8 @@ public class AuthorMerger {
|
|||
.replaceAll("(\\n)+", " ")
|
||||
|
||||
.trim();
|
||||
// return Arrays.stream(fullname.split("[\\s | , | ;]+")).map(String::toLowerCase).sorted().collect(Collectors.joining());
|
||||
}
|
||||
|
||||
//
|
||||
// public static List<Author> enrichOrcid2(List<Author> baseAuthor, List<Author> orcidAuthor) {
|
||||
// if (baseAuthor == null || baseAuthor.isEmpty())
|
||||
// return orcidAuthor;
|
||||
//
|
||||
// if (orcidAuthor == null || orcidAuthor.isEmpty())
|
||||
// return baseAuthor;
|
||||
//
|
||||
// if (baseAuthor.size() == 1 && orcidAuthor.size() > 10)
|
||||
// return baseAuthor;
|
||||
//
|
||||
//
|
||||
// Map<String, List<Author>> pubClusters = baseAuthor.stream().collect(Collectors.toMap(AuthorMerger::generateAuthorkey, Arrays::asList, (a, b) -> {
|
||||
// a.addAll(b);
|
||||
// return a;
|
||||
// }));
|
||||
//
|
||||
// Map<String, List<Author>> orcidClusters = baseAuthor.stream().collect(Collectors.toMap(AuthorMerger::generateAuthorkey, Arrays::asList, (a, b) -> {
|
||||
// a.addAll(b);
|
||||
// return a;
|
||||
// }));
|
||||
//
|
||||
// System.out.println(pubClusters.keySet().size());
|
||||
// System.out.println(orcidClusters.keySet().size());
|
||||
//
|
||||
//
|
||||
//
|
||||
//
|
||||
// return null;
|
||||
//
|
||||
//
|
||||
// }
|
||||
|
||||
static int hammingDist(String str1, String str2) {
|
||||
if (str1.length() != str2.length())
|
||||
return Math.max(str1.length(), str2.length());
|
||||
|
@ -220,7 +180,14 @@ public class AuthorMerger {
|
|||
return null;
|
||||
}
|
||||
|
||||
public static boolean checkSimilarity2(final Author left, final Author right) {
|
||||
/**
|
||||
* This method tries to figure out whether two authors are the same in the context
|
||||
* of ORCID enrichment
|
||||
* @param left Author in the OAF entity
|
||||
* @param right Author ORCID
|
||||
* @return whether the two authors are the same, based on a heuristic on their names.
|
||||
*/
|
||||
public static boolean checkORCIDSimilarity(final Author left, final Author right) {
|
||||
final Person pl = parse(left);
|
||||
final Person pr = parse(right);
|
||||
|
||||
|
@ -267,8 +234,16 @@ public class AuthorMerger {
|
|||
else
|
||||
return false;
|
||||
}
|
||||
//
|
||||
|
||||
public static List<Author> enrichOrcid2(List<Author> baseAuthor, List<Author> orcidAuthor) {
|
||||
|
||||
/**
|
||||
* Method to enrich ORCID information in one list of authors based on another list
|
||||
* @param baseAuthor the Author List in the OAF Entity
|
||||
* @param orcidAuthor The list of ORCID Author intersected
|
||||
* @return The Author List of the OAF Entity enriched with the orcid Author
|
||||
*/
|
||||
public static List<Author> enrichOrcid(List<Author> baseAuthor, List<Author> orcidAuthor) {
|
||||
|
||||
if (baseAuthor == null || baseAuthor.isEmpty())
|
||||
return orcidAuthor;
|
||||
|
@ -283,7 +258,7 @@ public class AuthorMerger {
|
|||
oAuthor.addAll(orcidAuthor);
|
||||
|
||||
baseAuthor.forEach(ba -> {
|
||||
Optional<Author> aMatch = oAuthor.stream().filter(oa -> checkSimilarity2(ba, oa)).findFirst();
|
||||
Optional<Author> aMatch = oAuthor.stream().filter(oa -> checkORCIDSimilarity(ba, oa)).findFirst();
|
||||
if (aMatch.isPresent()) {
|
||||
final Author sameAuthor = aMatch.get();
|
||||
addPid(ba, sameAuthor.getPid());
|
||||
|
@ -293,40 +268,6 @@ public class AuthorMerger {
|
|||
return baseAuthor;
|
||||
}
|
||||
|
||||
public static List<Author> enrichOrcid(List<Author> baseAuthor, List<Author> orcidAuthor) {
|
||||
|
||||
if (baseAuthor == null || baseAuthor.isEmpty())
|
||||
return orcidAuthor;
|
||||
|
||||
if (orcidAuthor == null || orcidAuthor.isEmpty())
|
||||
return baseAuthor;
|
||||
|
||||
if (baseAuthor.size() == 1 && orcidAuthor.size() > 10)
|
||||
return baseAuthor;
|
||||
|
||||
final Double similarityMatrix[][] = new Double[baseAuthor.size()][orcidAuthor.size()];
|
||||
|
||||
final List<SimilarityCellInfo> maxColums = new ArrayList<>();
|
||||
|
||||
for (int i = 0; i < orcidAuthor.size(); i++)
|
||||
maxColums.add(new SimilarityCellInfo());
|
||||
|
||||
for (int i = 0; i < baseAuthor.size(); i++) {
|
||||
for (int j = 0; j < orcidAuthor.size(); j++) {
|
||||
similarityMatrix[i][j] = sim(baseAuthor.get(i), orcidAuthor.get(j));
|
||||
if (maxColums.get(j).maxColumnSimilarity < similarityMatrix[i][j])
|
||||
maxColums.get(j).setValues(i, j, similarityMatrix[i][j]);
|
||||
}
|
||||
}
|
||||
maxColums
|
||||
.stream()
|
||||
.sorted()
|
||||
.filter(si -> si.maxColumnSimilarity > 0.85)
|
||||
.forEach(si -> addPid(baseAuthor.get(si.authorPosition), orcidAuthor.get(si.orcidPosition).getPid()));
|
||||
return baseAuthor;
|
||||
|
||||
}
|
||||
|
||||
private static void addPid(final Author a, final List<StructuredProperty> pids) {
|
||||
|
||||
if (a.getPid() == null) {
|
||||
|
|
|
@ -4,13 +4,9 @@ package eu.dnetlib.oa.merge;
|
|||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.platform.commons.util.StringUtils;
|
||||
|
@ -67,7 +63,7 @@ public class AuthorMergerTest {
|
|||
long start = System.currentTimeMillis();
|
||||
|
||||
// final List<Author> enrichedList = AuthorMerger.enrichOrcid(publicationAuthors, orcidAuthors);
|
||||
final List<Author> enrichedList = AuthorMerger.enrichOrcid2(publicationAuthors, orcidAuthors);
|
||||
final List<Author> enrichedList = AuthorMerger.enrichOrcid(publicationAuthors, orcidAuthors);
|
||||
|
||||
long enrichedAuthorWithPid = enrichedList
|
||||
.stream()
|
||||
|
@ -105,7 +101,7 @@ public class AuthorMergerTest {
|
|||
right.setSurname("Anand");
|
||||
right.setFullname("Rachna, Anand");
|
||||
// System.out.println(AuthorMerger.normalize(right.getFullname()));
|
||||
boolean same = AuthorMerger.checkSimilarity2(left, right);
|
||||
boolean same = AuthorMerger.checkORCIDSimilarity(left, right);
|
||||
|
||||
assertTrue(same);
|
||||
|
||||
|
|
|
@ -89,7 +89,7 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
|
|||
p
|
||||
}
|
||||
case (p: Publication, r: Row) =>
|
||||
p.setAuthor(AuthorMerger.enrichOrcid2(p.getAuthor, AuthorEnricher.toOAFAuthor(r)))
|
||||
p.setAuthor(AuthorMerger.enrichOrcid(p.getAuthor, AuthorEnricher.toOAFAuthor(r)))
|
||||
p
|
||||
}
|
||||
.write
|
||||
|
|
Loading…
Reference in New Issue