reimplementation of the sim between two authors. now it takes into account both name and surname. threshold incremented to 1.0 if the name is too short

This commit is contained in:
miconis 2020-05-22 17:24:57 +02:00
parent 70389b0a30
commit 0fd0c7d725
3 changed files with 40 additions and 18 deletions

View File

@ -20,12 +20,7 @@ public class AuthorMerger {
public static List<Author> merge(List<List<Author>> authors) {
authors.sort(new Comparator<List<Author>>() {
@Override
public int compare(List<Author> o1, List<Author> o2) {
return -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2));
}
});
authors.sort((o1, o2) -> -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2)));
List<Author> author = new ArrayList<>();
@ -86,20 +81,28 @@ public class AuthorMerger {
.stream()
.map(ba -> new Tuple2<>(sim(ba, a._2()), ba))
.max(Comparator.comparing(Tuple2::_1));
if (simAuthor.isPresent() && simAuthor.get()._1() > THRESHOLD) {
Author r = simAuthor.get()._2();
if (r.getPid() == null) {
r.setPid(new ArrayList<>());
if(simAuthor.isPresent()) {
double th = THRESHOLD;
//increase the threshold if the surname is too short
if (simAuthor.get()._2().getSurname() != null && simAuthor.get()._2().getSurname().length()<=3)
th = 0.99;
if (simAuthor.get()._1() > th) {
Author r = simAuthor.get()._2();
if (r.getPid() == null) {
r.setPid(new ArrayList<>());
}
r.getPid().add(a._1());
}
r.getPid().add(a._1());
}
});
}
public static String pidToComparableString(StructuredProperty pid) {
return (pid.getQualifier() != null
? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : ""
: "") + (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
return (pid.getQualifier() != null ?
pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : "" : "")
+ (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
}
public static int countAuthorsPids(List<Author> authors) {
@ -120,12 +123,14 @@ public class AuthorMerger {
final Person pa = parse(a);
final Person pb = parse(b);
//if both are accurate (e.g. they have name and surname)
if (pa.isAccurate() & pb.isAccurate()) {
return new JaroWinkler()
.score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString()));
return
new JaroWinkler().score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString()))*0.5
+ new JaroWinkler().score(normalize(pa.getNameString()), normalize(pb.getNameString()))*0.5;
} else {
return new JaroWinkler()
.score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
return
new JaroWinkler().score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
}
}

View File

@ -21,6 +21,7 @@ import scala.Tuple2;
public class EntityMergerTest implements Serializable {
List<Tuple2<String, Publication>> publications;
List<Tuple2<String, Publication>> publications2;
String testEntityBasePath;
DataInfo dataInfo;
@ -36,6 +37,7 @@ public class EntityMergerTest implements Serializable {
.getAbsolutePath();
publications = readSample(testEntityBasePath + "/publication_merge.json", Publication.class);
publications2 = readSample(testEntityBasePath + "/publication_merge2.json", Publication.class);
pub_top = getTopPub(publications);
@ -90,6 +92,17 @@ public class EntityMergerTest implements Serializable {
assertEquals(AuthorMerger.countAuthorsPids(pub_merged.getAuthor()), 4);
}
@Test
public void publicationMergerTest2() throws InstantiationException, IllegalAccessException, IOException {
Publication pub_merged = DedupRecordFactory
.entityMerger(dedupId, publications2.iterator(), 0, dataInfo, Publication.class);
assertEquals(pub_merged.getAuthor().size(), 27);
// insert assertions here
}
public DataInfo setDI() {
DataInfo dataInfo = new DataInfo();
dataInfo.setTrust("0.9");

File diff suppressed because one or more lines are too long