[Author Merger DoiBoost] add last part to consider also author->enrich map
This commit is contained in:
parent
f100dc5880
commit
1dd15ee2f2
|
@ -94,8 +94,6 @@ public class DoiBoostAuthorMerger {
|
||||||
|
|
||||||
//valutare se questa cosa va invertita: dovrei prendere per ogni enriching author quello che piu' gli somiglia
|
//valutare se questa cosa va invertita: dovrei prendere per ogni enriching author quello che piu' gli somiglia
|
||||||
//nella base list non il contrario
|
//nella base list non il contrario
|
||||||
|
|
||||||
|
|
||||||
private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
|
private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
|
||||||
|
|
||||||
//search authors having identifiers in the enrich list
|
//search authors having identifiers in the enrich list
|
||||||
|
@ -110,7 +108,7 @@ public class DoiBoostAuthorMerger {
|
||||||
a -> new Tuple2<>(DHPUtils.md5(a.getFullname()), AuthorAssoc.newInstance(a)))
|
a -> new Tuple2<>(DHPUtils.md5(a.getFullname()), AuthorAssoc.newInstance(a)))
|
||||||
.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
|
.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
|
||||||
|
|
||||||
Map<String, Tuple2<List<String>, Double>> baseAssoc = new HashMap<>();
|
Map<String, Tuple2<String,Tuple2<List<String>, Double>>> baseAssoc = new HashMap<>();
|
||||||
|
|
||||||
|
|
||||||
//for each author in the base list, we search the best enriching match
|
//for each author in the base list, we search the best enriching match
|
||||||
|
@ -124,7 +122,8 @@ public class DoiBoostAuthorMerger {
|
||||||
.collect(Collectors.toList()))
|
.collect(Collectors.toList()))
|
||||||
)
|
)
|
||||||
.forEach(t2 -> {
|
.forEach(t2 -> {
|
||||||
String base_name = DHPUtils.md5(t2._1().getFullname());
|
String base_name = t2._1().getFullname();
|
||||||
|
String base_name_md5 = DHPUtils.md5(t2._1().getFullname());
|
||||||
Double max_score = 0.0;
|
Double max_score = 0.0;
|
||||||
List<String> enrich_name = new ArrayList();
|
List<String> enrich_name = new ArrayList();
|
||||||
for (Tuple2<Author, Double> t : t2._2()) {
|
for (Tuple2<Author, Double> t : t2._2()) {
|
||||||
|
@ -149,12 +148,53 @@ public class DoiBoostAuthorMerger {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if(max_score > 0){
|
if(max_score > 0){
|
||||||
baseAssoc.put(base_name, new Tuple2<>(enrich_name, max_score));
|
baseAssoc.put(base_name_md5, new Tuple2(base_name, new Tuple2<>(enrich_name, max_score)));
|
||||||
}
|
}
|
||||||
|
|
||||||
});
|
});
|
||||||
|
List<Tuple2<Double, Tuple2<String, List<String>>>> list = baseAssoc.keySet().stream().map(k -> {
|
||||||
|
Tuple2<String, Tuple2<List<String>, Double>> map_entry = baseAssoc.get(k);
|
||||||
|
return new Tuple2<>(map_entry._2()._2(), new Tuple2<>(map_entry._1(), map_entry._2()._1()));
|
||||||
|
})
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
list.sort(Comparator.comparing(e -> e._1()));
|
||||||
|
//ordino per max score la baseAssoc
|
||||||
|
for (int i = list.size() -1 ; i>=0 ; i-- ){
|
||||||
|
Tuple2<Double, Tuple2<String, List<String>>> tmp = list.get(i);
|
||||||
|
List<String> entries = tmp._2()._2();
|
||||||
|
//se len = 1 => ho un solo e che con questo a ha max score
|
||||||
|
if(entries.size() == 1){
|
||||||
|
if(assocMap.containsKey(entries.get(0))) {
|
||||||
|
enrichAuthor(assocMap.get(entries.get(0)));
|
||||||
|
assocMap.remove(entries.get(0));
|
||||||
|
}
|
||||||
|
}else{
|
||||||
|
String author_fullname = tmp._2()._1();
|
||||||
|
long commonWords = 0;
|
||||||
|
String enriching = null;
|
||||||
|
for(String entry : entries){
|
||||||
|
if (assocMap.containsKey(entry)){
|
||||||
|
long words = getCommonWords(normalize(entry),
|
||||||
|
normalize(author_fullname));
|
||||||
|
if (words > commonWords){
|
||||||
|
commonWords = words;
|
||||||
|
enriching = entry;
|
||||||
|
}
|
||||||
|
if(words == commonWords){
|
||||||
|
enriching = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
assocMap.keySet().forEach(k -> enrichAuthor(assocMap.get(k)));
|
}
|
||||||
|
if(enriching != null){
|
||||||
|
enrichAuthor(assocMap.get(entries.get(0)));
|
||||||
|
assocMap.remove(entries.get(0));
|
||||||
|
}
|
||||||
|
//TODO pensare ad un modo per arricchire con il miglior e questo autore
|
||||||
|
//Siamo nel caso in cui un autore ha piu' di un e con lo stesso similarity score
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// assocMap.keySet().forEach(k -> enrichAuthor(assocMap.get(k)));
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue