1
0
Fork 0

Reimplementation of the similarity (sim) between two authors: it now takes into account both name and surname. The threshold is raised to 0.99 if the surname is too short (3 characters or fewer).

This commit is contained in:
miconis 2020-05-22 17:24:57 +02:00
parent 70389b0a30
commit 0fd0c7d725
3 changed files with 40 additions and 18 deletions

View File

@ -20,12 +20,7 @@ public class AuthorMerger {
public static List<Author> merge(List<List<Author>> authors) { public static List<Author> merge(List<List<Author>> authors) {
authors.sort(new Comparator<List<Author>>() { authors.sort((o1, o2) -> -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2)));
@Override
public int compare(List<Author> o1, List<Author> o2) {
return -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2));
}
});
List<Author> author = new ArrayList<>(); List<Author> author = new ArrayList<>();
@ -86,20 +81,28 @@ public class AuthorMerger {
.stream() .stream()
.map(ba -> new Tuple2<>(sim(ba, a._2()), ba)) .map(ba -> new Tuple2<>(sim(ba, a._2()), ba))
.max(Comparator.comparing(Tuple2::_1)); .max(Comparator.comparing(Tuple2::_1));
if (simAuthor.isPresent() && simAuthor.get()._1() > THRESHOLD) {
if(simAuthor.isPresent()) {
double th = THRESHOLD;
//increase the threshold if the surname is too short
if (simAuthor.get()._2().getSurname() != null && simAuthor.get()._2().getSurname().length()<=3)
th = 0.99;
if (simAuthor.get()._1() > th) {
Author r = simAuthor.get()._2(); Author r = simAuthor.get()._2();
if (r.getPid() == null) { if (r.getPid() == null) {
r.setPid(new ArrayList<>()); r.setPid(new ArrayList<>());
} }
r.getPid().add(a._1()); r.getPid().add(a._1());
} }
}
}); });
} }
public static String pidToComparableString(StructuredProperty pid) { public static String pidToComparableString(StructuredProperty pid) {
return (pid.getQualifier() != null return (pid.getQualifier() != null ?
? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : "" pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : "" : "")
: "") + (pid.getValue() != null ? pid.getValue().toLowerCase() : ""); + (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
} }
public static int countAuthorsPids(List<Author> authors) { public static int countAuthorsPids(List<Author> authors) {
@ -120,12 +123,14 @@ public class AuthorMerger {
final Person pa = parse(a); final Person pa = parse(a);
final Person pb = parse(b); final Person pb = parse(b);
//if both are accurate (e.g. they have name and surname)
if (pa.isAccurate() & pb.isAccurate()) { if (pa.isAccurate() & pb.isAccurate()) {
return new JaroWinkler() return
.score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())); new JaroWinkler().score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString()))*0.5
+ new JaroWinkler().score(normalize(pa.getNameString()), normalize(pb.getNameString()))*0.5;
} else { } else {
return new JaroWinkler() return
.score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname())); new JaroWinkler().score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
} }
} }

View File

@ -21,6 +21,7 @@ import scala.Tuple2;
public class EntityMergerTest implements Serializable { public class EntityMergerTest implements Serializable {
List<Tuple2<String, Publication>> publications; List<Tuple2<String, Publication>> publications;
List<Tuple2<String, Publication>> publications2;
String testEntityBasePath; String testEntityBasePath;
DataInfo dataInfo; DataInfo dataInfo;
@ -36,6 +37,7 @@ public class EntityMergerTest implements Serializable {
.getAbsolutePath(); .getAbsolutePath();
publications = readSample(testEntityBasePath + "/publication_merge.json", Publication.class); publications = readSample(testEntityBasePath + "/publication_merge.json", Publication.class);
publications2 = readSample(testEntityBasePath + "/publication_merge2.json", Publication.class);
pub_top = getTopPub(publications); pub_top = getTopPub(publications);
@ -90,6 +92,17 @@ public class EntityMergerTest implements Serializable {
assertEquals(AuthorMerger.countAuthorsPids(pub_merged.getAuthor()), 4); assertEquals(AuthorMerger.countAuthorsPids(pub_merged.getAuthor()), 4);
} }
/**
 * Merges the publications read from {@code publication_merge2.json} into a single
 * record and checks the number of authors in the merged result.
 *
 * @throws InstantiationException declared by the test; presumably raised by the entity merger — TODO confirm
 * @throws IllegalAccessException declared by the test; presumably raised by the entity merger — TODO confirm
 * @throws IOException declared by the test signature
 */
@Test
public void publicationMergerTest2() throws InstantiationException, IllegalAccessException, IOException {
    Publication pub_merged = DedupRecordFactory
        .entityMerger(dedupId, publications2.iterator(), 0, dataInfo, Publication.class);

    // JUnit's assertEquals takes (expected, actual): keeping this order makes a
    // failure report read "expected: 27 but was: <n>" instead of the reverse.
    assertEquals(27, pub_merged.getAuthor().size());

    // TODO: add further assertions on the content of the merged record
}
public DataInfo setDI() { public DataInfo setDI() {
DataInfo dataInfo = new DataInfo(); DataInfo dataInfo = new DataInfo();
dataInfo.setTrust("0.9"); dataInfo.setTrust("0.9");

File diff suppressed because one or more lines are too long