[DoiBoost Author merger]
This commit is contained in:
parent 41ea1b2177
commit 910abcba04
AuthorAssoc.java
@@ -1,52 +1,53 @@

package eu.dnetlib.doiboost;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

import eu.dnetlib.dhp.schema.oaf.Author;

/**
 * This class stores the association information between the enriching author and the possibly enriched ones.
 * It also contains the value of the similarity score between the enriching author and the possibly enriched ones.
 * Possibly enriched authors that share the same similarity score with the enriching author are put in the
 * to_be_enriched list.
 */
public class AuthorAssoc implements Serializable {
	private Double score;
	private List<Author> to_be_enriched;
	private Author with_enricheing_content;

	public Double getScore() {
		return score;
	}

	public void setScore(Double score) {
		this.score = score;
	}

	public List<Author> getTo_be_enriched() {
		return to_be_enriched;
	}

	public void setTo_be_enriched(List<Author> to_be_enriched) {
		this.to_be_enriched = to_be_enriched;
	}

	public Author getWith_enricheing_content() {
		return with_enricheing_content;
	}

	public void setWith_enricheing_content(Author with_enricheing_content) {
		this.with_enricheing_content = with_enricheing_content;
	}

	public static AuthorAssoc newInstance(Author a) {
		AuthorAssoc ret = new AuthorAssoc();
		ret.score = 0.0;
		ret.to_be_enriched = new ArrayList<>();
		ret.with_enricheing_content = a;

		return ret;
	}
}
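A minimal, hypothetical sketch (not part of the commit) of how an AuthorAssoc is meant to be populated while scanning enrichment candidates. The demo class, the sample names and the similarity value are invented; the update rule mirrors the one applied by DoiBoostAuthorMerger below.

import java.util.ArrayList;

import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.doiboost.AuthorAssoc;

public class AuthorAssocDemo {
	public static void main(String[] args) {
		Author enriching = new Author(); // the author carrying identifiers (e.g. an ORCID pid)
		enriching.setFullname("Miriam Baglioni");

		// newInstance starts from score = 0.0 and an empty to_be_enriched list
		AuthorAssoc assoc = AuthorAssoc.newInstance(enriching);

		Author candidate = new Author(); // a possibly enriched author from the base list
		candidate.setFullname("M. Baglioni");

		double similarity = 0.9; // assumed value; the merger computes it with a Jaccard score
		if (assoc.getScore() < similarity) {
			// a strictly better candidate resets the list of authors to be enriched
			assoc.setScore(similarity);
			assoc.setTo_be_enriched(new ArrayList<>());
		}
		assoc.getTo_be_enriched().add(candidate);

		System.out.println(assoc.getScore() + " -> " + assoc.getTo_be_enriched().size()); // 0.9 -> 1
	}
}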
DoiBoostAuthorMerger.java
@@ -6,14 +6,12 @@ import java.util.*;

import java.util.stream.Collectors;

import com.wcohen.ss.Jaccard;
import com.wcohen.ss.JaroWinkler;

import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;

/**
@@ -49,258 +47,250 @@

public class DoiBoostAuthorMerger {

	public static List<Author> merge(List<List<Author>> authors, Boolean crossref) {
		Iterator<List<Author>> it = authors.iterator();
		List<Author> author = it.next();

		while (it.hasNext()) {
			List<Author> autList = it.next();
			Tuple2<List<Author>, Boolean> tmp = mergeAuthor(author, autList, crossref);
			author = tmp._1();
			crossref = tmp._2();
		}

		return author;
	}

	// If we have a list of authors coming from crossref we take that and we enrich it.
	// If we do not have a list of authors coming from crossref we enrich the longest at each step.
	public static Tuple2<List<Author>, Boolean> mergeAuthor(final List<Author> baseAuthor,
		final List<Author> otherAuthor,
		final Boolean crossref) {

		if (baseAuthor == null || baseAuthor.size() == 0)
			return new Tuple2<>(otherAuthor, false);
		if (otherAuthor == null || otherAuthor.size() == 0)
			return new Tuple2<>(baseAuthor, crossref);

		if (crossref) {
			enrichPidFromList(baseAuthor, otherAuthor);
			return new Tuple2<>(baseAuthor, true);
		} else if (baseAuthor.size() > otherAuthor.size()) {
			enrichPidFromList(baseAuthor, otherAuthor);
			return new Tuple2<>(baseAuthor, false);
		} else {
			enrichPidFromList(otherAuthor, baseAuthor);
			return new Tuple2<>(otherAuthor, false);
		}

	}

	// consider whether this should be inverted: for each enriching author we should take the
	// most similar one in the base list, not the other way around
	private static void enrichPidFromList(List<Author> base, List<Author> enrich) {

		// search authors having identifiers in the enrich list
		final List<Author> authorsWithPids = enrich
			.stream()
			.filter(a -> a.getPid() != null && a.getPid().size() > 0)
			.collect(Collectors.toList());

		Map<String, AuthorAssoc> assocMap = authorsWithPids
			.stream()
			.map(
				a -> new Tuple2<>(DHPUtils.md5(a.getFullname()), AuthorAssoc.newInstance(a)))
			.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));

		Map<String, Tuple2<String, Tuple2<List<String>, Double>>> baseAssoc = new HashMap<>();

		// for each author in the base list, we search the best enriching match
		// we create the association (author, list of (enriching author, similarity score))
		base
			.stream()
			.map(
				a -> new Tuple2<>(a,
					authorsWithPids
						.stream()
						.map(e -> new Tuple2<>(e, sim(a, e)))
						.filter(t2 -> t2._2() > 0.0)
						.collect(Collectors.toList())))
			.forEach(t2 -> {
				String base_name = t2._1().getFullname();
				String base_name_md5 = DHPUtils.md5(t2._1().getFullname());
				Double max_score = 0.0;
				List<String> enrich_name = new ArrayList<>();
				for (Tuple2<Author, Double> t : t2._2()) {
					// the key of the enriching author: the md5 of its fullname
					String mapEntry = DHPUtils.md5(t._1().getFullname());

					if (t._2() > max_score) {
						max_score = t._2();
						enrich_name = new ArrayList<>();
						enrich_name.add(mapEntry);
					} else if (t._2() > 0 && t._2().equals(max_score)) {
						enrich_name.add(mapEntry);
					}

					AuthorAssoc aa = assocMap.get(mapEntry);
					if (aa.getScore() < t._2()) {
						aa.setScore(t._2());
						aa.setTo_be_enriched(new ArrayList<>());
						aa.getTo_be_enriched().add(t2._1());
					} else {
						aa.getTo_be_enriched().add(t2._1());
					}
				}
				if (max_score > 0) {
					baseAssoc.put(base_name_md5, new Tuple2<>(base_name, new Tuple2<>(enrich_name, max_score)));
				}

			});

		List<Tuple2<Double, Tuple2<String, List<String>>>> list = baseAssoc.keySet().stream().map(k -> {
			Tuple2<String, Tuple2<List<String>, Double>> map_entry = baseAssoc.get(k);
			return new Tuple2<>(map_entry._2()._2(), new Tuple2<>(map_entry._1(), map_entry._2()._1()));
		})
			.collect(Collectors.toList());
		// sort the baseAssoc entries by max score and visit them from the highest score down
		list.sort(Comparator.comparing(e -> e._1()));
		for (int i = list.size() - 1; i >= 0; i--) {
			Tuple2<Double, Tuple2<String, List<String>>> tmp = list.get(i);
			List<String> entries = tmp._2()._2();
			// a single entry means exactly one enriching author reaches the max score with this author
			if (entries.size() == 1) {
				if (assocMap.containsKey(entries.get(0))) {
					enrichAuthor(assocMap.get(entries.get(0)));
					assocMap.remove(entries.get(0));
				}
			} else {
				String author_fullname = tmp._2()._1();
				long commonWords = 0;
				String enriching = null;
				for (String entry : entries) {
					if (assocMap.containsKey(entry)) {
						// entry is an md5 key: compare on the enriching author's fullname
						long words = getCommonWords(
							normalize(assocMap.get(entry).getWith_enricheing_content().getFullname()),
							normalize(author_fullname));
						if (words > commonWords) {
							commonWords = words;
							enriching = entry;
						} else if (words == commonWords) {
							// a tie: there is no unambiguous winner
							enriching = null;
						}
					}

				}
				if (enriching != null) {
					enrichAuthor(assocMap.get(enriching));
					assocMap.remove(enriching);
				}
				// TODO: find a way to enrich this author with the best enriching one.
				// This is the case in which an author has more than one enriching author
				// with the same similarity score.
			}
		}
		// assocMap.keySet().forEach(k -> enrichAuthor(assocMap.get(k)));

	}

	private static long getCommonWords(List<String> fullEnrich, List<String> fullEnriching) {
		return fullEnrich.stream().filter(w -> fullEnriching.contains(w)).count();
	}

	private static void enrichAuthor(Author enrich, Author enriching) {
		// verify that some of the words in one fullname are contained in the other:
		// compare the normalized fullnames
		long commonWords = getCommonWords(
			normalize(enrich.getFullname()),
			normalize(enriching.getFullname()));
		if (commonWords > 0) {
			if (enrich.getPid() == null) {
				enrich.setPid(new ArrayList<>());
			}
			Set<String> aPids = enrich.getPid().stream().map(p -> pidToComparableString(p)).collect(Collectors.toSet());
			enriching.getPid().forEach(p -> {
				if (!aPids.contains(pidToComparableString(p))) {
					enrich.getPid().add(p);
				}
			});
			if (enrich.getAffiliation() == null) {
				if (enriching.getAffiliation() != null) {
					enrich.setAffiliation(enriching.getAffiliation());
				}
			}
		}

	}

	// Verify the number of words in common. The one that has more wins.
	// If the number of words in common is the same, we enrich no author.
	private static void enrichAuthor(AuthorAssoc authorAssoc) {
		if (authorAssoc.getTo_be_enriched().size() == 1) {
			enrichAuthor(authorAssoc.getTo_be_enriched().get(0), authorAssoc.getWith_enricheing_content());
		} else {
			long common = 0;
			List<Author> selected = new ArrayList<>();
			for (Author a : authorAssoc.getTo_be_enriched()) {
				long current_common = getCommonWords(
					normalize(a.getFullname()),
					normalize(authorAssoc.getWith_enricheing_content().getFullname()));
				if (current_common > common) {
					common = current_common;
					selected = new ArrayList<>();
					selected.add(a);
				} else if (current_common == common) {
					selected.add(a);
				}
			}
			if (selected.size() == 1) {
				enrichAuthor(selected.get(0), authorAssoc.getWith_enricheing_content());
			}
		}

	}

	public static String pidToComparableString(StructuredProperty pid) {
		return (pid.getQualifier() != null
			? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : ""
			: "")
			+ (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
	}

	private static Double sim(Author a, Author b) {
		return new Jaccard()
			.score(normalizeString(a.getFullname()), normalizeString(b.getFullname()));
	}

	private static String normalizeString(String fullname) {
		return String.join(" ", normalize(fullname));
	}

	private static List<String> normalize(final String s) {
		String[] normalized = nfd(s)
			.replaceAll("[^\\p{ASCII}]", "")
			.toLowerCase()
			// do not compact the regexes in a single expression: it would cause a
			// StackOverflowError in case of large input strings
			.replaceAll("(\\W)+", " ")
			.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
			.replaceAll("(\\p{Punct})+", " ")
			.replaceAll("(\\d)+", " ")
			.replaceAll("(\\n)+", " ")
			.trim()
			.split(" ");

		Arrays.sort(normalized);

		return Arrays.asList(normalized);
	}

	private static String nfd(final String s) {
		return Normalizer.normalize(s, Normalizer.Form.NFD);
	}
}
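A hedged sketch of how the merger could be driven, assuming DoiBoostAuthorMerger sits in the eu.dnetlib.doiboost package next to AuthorAssoc; the demo class and the sample authors are invented.

import java.util.Arrays;
import java.util.List;

import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.doiboost.DoiBoostAuthorMerger;

public class MergeDemo {
	public static void main(String[] args) {
		Author crossrefAuthor = new Author(); // the author as it appears in the crossref record
		crossrefAuthor.setFullname("Miriam Baglioni");

		Author orcidAuthor = new Author(); // in a real run this one would carry an ORCID pid
		orcidAuthor.setFullname("M. Baglioni");

		List<List<Author>> authorLists = Arrays.asList(
			Arrays.asList(crossrefAuthor), // base list: crossref = true keeps it as the base
			Arrays.asList(orcidAuthor)); // further lists only contribute pids and affiliations

		List<Author> merged = DoiBoostAuthorMerger.merge(authorLists, true);
		System.out.println(merged.size()); // 1: the crossref list, enriched in place
	}
}

With crossref set to true the crossref list always survives as the base; with false the longest list wins at every step, which is what the comment above mergeAuthor describes.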
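The matching itself rests on sim(), a Jaccard score computed over normalized, sorted tokens. The standalone re-implementation of the private normalize pipeline below, illustrative only, shows why comma, case and accent variants of the same name compare as equal token sets.

import java.text.Normalizer;
import java.util.Arrays;
import java.util.List;

public class NormalizeDemo {
	// mirrors DoiBoostAuthorMerger.normalize, reproduced here for illustration
	static List<String> normalize(String s) {
		String[] tokens = Normalizer.normalize(s, Normalizer.Form.NFD)
			.replaceAll("[^\\p{ASCII}]", "") // accents were split off by NFD and are dropped here
			.toLowerCase()
			.replaceAll("(\\W)+", " ")
			.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
			.replaceAll("(\\p{Punct})+", " ")
			.replaceAll("(\\d)+", " ")
			.replaceAll("(\\n)+", " ")
			.trim()
			.split(" ");
		Arrays.sort(tokens); // word order no longer matters
		return Arrays.asList(tokens);
	}

	public static void main(String[] args) {
		System.out.println(normalize("Baglioni, Miriam")); // [baglioni, miriam]
		System.out.println(normalize("Miriam Baglioni")); // [baglioni, miriam]
		System.out.println(normalize("José Ñuñez")); // [jose, nunez]
		// identical token sets mean sim() returns a Jaccard score of 1.0
	}
}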