reimplementation of the sim between two authors. now it takes into account both name and surname. threshold incremented to 1.0 if the name is too short
This commit is contained in:
parent
70389b0a30
commit
0fd0c7d725
|
@ -20,12 +20,7 @@ public class AuthorMerger {
|
||||||
|
|
||||||
public static List<Author> merge(List<List<Author>> authors) {
|
public static List<Author> merge(List<List<Author>> authors) {
|
||||||
|
|
||||||
authors.sort(new Comparator<List<Author>>() {
|
authors.sort((o1, o2) -> -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2)));
|
||||||
@Override
|
|
||||||
public int compare(List<Author> o1, List<Author> o2) {
|
|
||||||
return -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2));
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
List<Author> author = new ArrayList<>();
|
List<Author> author = new ArrayList<>();
|
||||||
|
|
||||||
|
@ -86,20 +81,28 @@ public class AuthorMerger {
|
||||||
.stream()
|
.stream()
|
||||||
.map(ba -> new Tuple2<>(sim(ba, a._2()), ba))
|
.map(ba -> new Tuple2<>(sim(ba, a._2()), ba))
|
||||||
.max(Comparator.comparing(Tuple2::_1));
|
.max(Comparator.comparing(Tuple2::_1));
|
||||||
if (simAuthor.isPresent() && simAuthor.get()._1() > THRESHOLD) {
|
|
||||||
Author r = simAuthor.get()._2();
|
if(simAuthor.isPresent()) {
|
||||||
if (r.getPid() == null) {
|
double th = THRESHOLD;
|
||||||
r.setPid(new ArrayList<>());
|
//increase the threshold if the surname is too short
|
||||||
|
if (simAuthor.get()._2().getSurname() != null && simAuthor.get()._2().getSurname().length()<=3)
|
||||||
|
th = 0.99;
|
||||||
|
|
||||||
|
if (simAuthor.get()._1() > th) {
|
||||||
|
Author r = simAuthor.get()._2();
|
||||||
|
if (r.getPid() == null) {
|
||||||
|
r.setPid(new ArrayList<>());
|
||||||
|
}
|
||||||
|
r.getPid().add(a._1());
|
||||||
}
|
}
|
||||||
r.getPid().add(a._1());
|
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String pidToComparableString(StructuredProperty pid) {
|
public static String pidToComparableString(StructuredProperty pid) {
|
||||||
return (pid.getQualifier() != null
|
return (pid.getQualifier() != null ?
|
||||||
? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : ""
|
pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : "" : "")
|
||||||
: "") + (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
|
+ (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
|
||||||
}
|
}
|
||||||
|
|
||||||
public static int countAuthorsPids(List<Author> authors) {
|
public static int countAuthorsPids(List<Author> authors) {
|
||||||
|
@ -120,12 +123,14 @@ public class AuthorMerger {
|
||||||
final Person pa = parse(a);
|
final Person pa = parse(a);
|
||||||
final Person pb = parse(b);
|
final Person pb = parse(b);
|
||||||
|
|
||||||
|
//if both are accurate (e.g. they have name and surname)
|
||||||
if (pa.isAccurate() & pb.isAccurate()) {
|
if (pa.isAccurate() & pb.isAccurate()) {
|
||||||
return new JaroWinkler()
|
return
|
||||||
.score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString()));
|
new JaroWinkler().score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString()))*0.5
|
||||||
|
+ new JaroWinkler().score(normalize(pa.getNameString()), normalize(pb.getNameString()))*0.5;
|
||||||
} else {
|
} else {
|
||||||
return new JaroWinkler()
|
return
|
||||||
.score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
|
new JaroWinkler().score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -21,6 +21,7 @@ import scala.Tuple2;
|
||||||
public class EntityMergerTest implements Serializable {
|
public class EntityMergerTest implements Serializable {
|
||||||
|
|
||||||
List<Tuple2<String, Publication>> publications;
|
List<Tuple2<String, Publication>> publications;
|
||||||
|
List<Tuple2<String, Publication>> publications2;
|
||||||
|
|
||||||
String testEntityBasePath;
|
String testEntityBasePath;
|
||||||
DataInfo dataInfo;
|
DataInfo dataInfo;
|
||||||
|
@ -36,6 +37,7 @@ public class EntityMergerTest implements Serializable {
|
||||||
.getAbsolutePath();
|
.getAbsolutePath();
|
||||||
|
|
||||||
publications = readSample(testEntityBasePath + "/publication_merge.json", Publication.class);
|
publications = readSample(testEntityBasePath + "/publication_merge.json", Publication.class);
|
||||||
|
publications2 = readSample(testEntityBasePath + "/publication_merge2.json", Publication.class);
|
||||||
|
|
||||||
pub_top = getTopPub(publications);
|
pub_top = getTopPub(publications);
|
||||||
|
|
||||||
|
@ -90,6 +92,17 @@ public class EntityMergerTest implements Serializable {
|
||||||
assertEquals(AuthorMerger.countAuthorsPids(pub_merged.getAuthor()), 4);
|
assertEquals(AuthorMerger.countAuthorsPids(pub_merged.getAuthor()), 4);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void publicationMergerTest2() throws InstantiationException, IllegalAccessException, IOException {
|
||||||
|
|
||||||
|
Publication pub_merged = DedupRecordFactory
|
||||||
|
.entityMerger(dedupId, publications2.iterator(), 0, dataInfo, Publication.class);
|
||||||
|
|
||||||
|
assertEquals(pub_merged.getAuthor().size(), 27);
|
||||||
|
// insert assertions here
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
public DataInfo setDI() {
|
public DataInfo setDI() {
|
||||||
DataInfo dataInfo = new DataInfo();
|
DataInfo dataInfo = new DataInfo();
|
||||||
dataInfo.setTrust("0.9");
|
dataInfo.setTrust("0.9");
|
||||||
|
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue