2021-12-27 17:35:02 +01:00
|
|
|
package eu.dnetlib.pace.tree;
|
|
|
|
|
|
|
|
import com.google.common.collect.Iterables;
|
|
|
|
import eu.dnetlib.pace.config.Config;
|
|
|
|
import eu.dnetlib.pace.model.Field;
|
|
|
|
import eu.dnetlib.pace.model.FieldList;
|
|
|
|
import eu.dnetlib.pace.model.Person;
|
|
|
|
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
|
|
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
2022-01-13 11:58:28 +01:00
|
|
|
import com.wcohen.ss.AbstractStringDistance;
|
2021-12-27 17:35:02 +01:00
|
|
|
|
|
|
|
import java.util.Comparator;
|
|
|
|
import java.util.List;
|
|
|
|
import java.util.Map;
|
|
|
|
import java.util.function.Function;
|
|
|
|
import java.util.stream.Collectors;
|
|
|
|
import java.util.stream.Stream;
|
|
|
|
|
|
|
|
@ComparatorClass("authorsMatch")
|
|
|
|
public class AuthorsMatch extends AbstractComparator {
|
|
|
|
|
|
|
|
Map<String, String> params;
|
|
|
|
|
|
|
|
private double SURNAME_THRESHOLD;
|
|
|
|
private double NAME_THRESHOLD;
|
|
|
|
private double FULLNAME_THRESHOLD;
|
|
|
|
private String MODE; //full or surname
|
2022-03-08 16:49:28 +01:00
|
|
|
private int SIZE_THRESHOLD;
|
2023-04-04 17:10:37 +02:00
|
|
|
private String TYPE; //count or percentage
|
2022-01-13 11:58:28 +01:00
|
|
|
private int common;
|
2021-12-27 17:35:02 +01:00
|
|
|
|
|
|
|
public AuthorsMatch(Map<String, String> params){
|
|
|
|
super(params, new com.wcohen.ss.JaroWinkler());
|
|
|
|
this.params = params;
|
|
|
|
|
|
|
|
MODE = params.getOrDefault("mode", "full");
|
|
|
|
SURNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("surname_th", "0.95"));
|
|
|
|
NAME_THRESHOLD = Double.parseDouble(params.getOrDefault("name_th", "0.95"));
|
|
|
|
FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9"));
|
2022-03-08 16:49:28 +01:00
|
|
|
SIZE_THRESHOLD = Integer.parseInt(params.getOrDefault("size_th", "20"));
|
2023-04-04 17:10:37 +02:00
|
|
|
TYPE = params.getOrDefault("type", "percentage");
|
2022-01-13 11:58:28 +01:00
|
|
|
common = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
protected AuthorsMatch(double w, AbstractStringDistance ssalgo) {
|
|
|
|
super(w, ssalgo);
|
2021-12-27 17:35:02 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public double compare(final Field a, final Field b, final Config conf) {
|
|
|
|
|
|
|
|
if (a.isEmpty() || b.isEmpty())
|
|
|
|
return -1;
|
|
|
|
|
2023-04-17 11:06:27 +02:00
|
|
|
if (((FieldList) a).size() > SIZE_THRESHOLD || ((FieldList) b).size() > SIZE_THRESHOLD)
|
2022-03-08 16:49:28 +01:00
|
|
|
return 1.0;
|
|
|
|
|
2021-12-27 17:35:02 +01:00
|
|
|
List<Person> aList = ((FieldList) a).stringList().stream().map(author -> new Person(author, false)).collect(Collectors.toList());
|
|
|
|
List<Person> bList = ((FieldList) b).stringList().stream().map(author -> new Person(author, false)).collect(Collectors.toList());
|
|
|
|
|
2022-01-13 11:58:28 +01:00
|
|
|
common = 0;
|
|
|
|
//compare each element of List1 with each element of List2
|
2021-12-27 17:35:02 +01:00
|
|
|
for (Person p1 : aList)
|
2022-01-13 11:58:28 +01:00
|
|
|
|
|
|
|
for (Person p2 : bList) {
|
|
|
|
|
|
|
|
//both persons are inaccurate
|
|
|
|
if (!p1.isAccurate() && !p2.isAccurate()) {
|
|
|
|
//compare just normalized fullnames
|
2022-01-13 17:20:20 +01:00
|
|
|
String fullname1 = normalization(p1.getNormalisedFullname().isEmpty()? p1.getOriginal() : p1.getNormalisedFullname());
|
|
|
|
String fullname2 = normalization(p2.getNormalisedFullname().isEmpty()? p2.getOriginal() : p2.getNormalisedFullname());
|
|
|
|
|
|
|
|
if (ssalgo.score(fullname1, fullname2) > FULLNAME_THRESHOLD) {
|
2021-12-27 17:35:02 +01:00
|
|
|
common += 1;
|
2022-01-13 11:58:28 +01:00
|
|
|
break;
|
|
|
|
}
|
2021-12-27 17:35:02 +01:00
|
|
|
}
|
2022-01-13 11:58:28 +01:00
|
|
|
|
|
|
|
//one person is inaccurate
|
|
|
|
if (p1.isAccurate() ^ p2.isAccurate()) {
|
|
|
|
//prepare data
|
2022-03-09 12:53:09 +01:00
|
|
|
//data for the accurate person
|
2022-01-13 17:20:20 +01:00
|
|
|
String name = normalization(p1.isAccurate()? p1.getNormalisedFirstName() : p2.getNormalisedFirstName());
|
2022-03-09 12:53:09 +01:00
|
|
|
String surname = normalization(p1.isAccurate()? p1.getNormalisedSurname() : p2.getNormalisedSurname());
|
2022-01-13 11:58:28 +01:00
|
|
|
|
2022-03-09 12:53:09 +01:00
|
|
|
//data for the inaccurate person
|
2022-01-13 17:20:20 +01:00
|
|
|
String fullname = normalization(
|
2022-03-09 12:53:09 +01:00
|
|
|
p1.isAccurate() ? ((p2.getNormalisedFullname().isEmpty()) ? p2.getOriginal() : p2.getNormalisedFullname()) : (p1.getNormalisedFullname().isEmpty() ? p1.getOriginal() : p1.getNormalisedFullname())
|
2022-01-13 17:20:20 +01:00
|
|
|
);
|
2022-01-13 11:58:28 +01:00
|
|
|
|
|
|
|
if (fullname.contains(surname)) {
|
|
|
|
if (MODE.equals("full")) {
|
|
|
|
if (fullname.contains(name)) {
|
|
|
|
common += 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else { //MODE equals "surname"
|
|
|
|
common += 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2021-12-27 17:35:02 +01:00
|
|
|
}
|
|
|
|
|
2022-01-13 11:58:28 +01:00
|
|
|
//both persons are accurate
|
|
|
|
if (p1.isAccurate() && p2.isAccurate()) {
|
|
|
|
|
|
|
|
if (compareSurname(p1, p2)) {
|
|
|
|
if (MODE.equals("full")) {
|
|
|
|
if(compareFirstname(p1, p2)) {
|
|
|
|
common += 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else { //MODE equals "surname"
|
|
|
|
common += 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2021-12-27 17:35:02 +01:00
|
|
|
|
2022-01-13 11:58:28 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
//normalization factor to compute the score
|
|
|
|
int normFactor = aList.size() == bList.size() ? aList.size() : (aList.size() + bList.size() - common);
|
2021-12-27 17:35:02 +01:00
|
|
|
|
2023-04-04 17:10:37 +02:00
|
|
|
if(TYPE.equals("percentage")) {
|
|
|
|
return (double) common / normFactor;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
return (double) common;
|
|
|
|
}
|
2022-01-13 11:58:28 +01:00
|
|
|
}
|
2021-12-27 17:35:02 +01:00
|
|
|
|
2022-01-13 11:58:28 +01:00
|
|
|
public boolean compareSurname(Person p1, Person p2) {
|
|
|
|
return ssalgo.score(normalization(p1.getNormalisedSurname()), normalization(p2.getNormalisedSurname())) > SURNAME_THRESHOLD;
|
2021-12-27 17:35:02 +01:00
|
|
|
}
|
|
|
|
|
2022-01-13 11:58:28 +01:00
|
|
|
public boolean compareFirstname(Person p1, Person p2) {
|
|
|
|
|
|
|
|
if(p1.getNormalisedFirstName().length()<=2 || p2.getNormalisedFirstName().length()<=2) {
|
|
|
|
if (firstLC(p1.getNormalisedFirstName()).equals(firstLC(p2.getNormalisedFirstName())))
|
|
|
|
return true;
|
|
|
|
}
|
2021-12-27 17:35:02 +01:00
|
|
|
|
2022-01-13 11:58:28 +01:00
|
|
|
return ssalgo.score(normalization(p1.getNormalisedFirstName()), normalization(p2.getNormalisedFirstName())) > NAME_THRESHOLD;
|
|
|
|
}
|
2021-12-27 17:35:02 +01:00
|
|
|
|
2022-01-13 11:58:28 +01:00
|
|
|
public String normalization(String s) {
|
|
|
|
return normalize(utf8(cleanup(s)));
|
2021-12-27 17:35:02 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
}
|