bug fix in the authormatch comparator, implementation of tests
This commit is contained in:
parent 6eb0730188
commit e168d95ec0
@@ -1,2 +1 @@
-# Mon Sep 13 14:51:29 CEST 2021
-projectPropertyKey=projectPropertyValue
+# Thu Dec 30 13:11:51 CET 2021

@@ -85,7 +85,7 @@ public class Deduper implements Serializable {
     }
 
     public static JavaRDD<Relation> computeRelations(
-        JavaSparkContext context, JavaPairRDD<String, Block> blocks, DedupConfig config, boolean useTree) {
+        JavaSparkContext context, JavaPairRDD<String, Block> blocks, DedupConfig config, boolean useTree, boolean noMatch) {
         Map<String, LongAccumulator> accumulators = Utility.constructAccumulator(config, context.sc());
 
         return blocks
@@ -93,7 +93,7 @@ public class Deduper implements Serializable {
             it -> {
                 final SparkReporter reporter = new SparkReporter(accumulators);
                 new BlockProcessorForTesting(config)
-                    .processSortedBlock(it._1(), it._2().getDocuments(), reporter, useTree);
+                    .processSortedBlock(it._1(), it._2().getDocuments(), reporter, useTree, noMatch);
                 return reporter.getRelations().iterator();
             })
             .mapToPair(it -> new Tuple2<>(it._1() + it._2(), new Relation(it._1(), it._2(), "simRel")))
@@ -101,7 +101,7 @@ public class Deduper implements Serializable {
             .map(Tuple2::_2);
     }
 
-    public static void createSimRels(DedupConfig dedupConf, SparkSession spark, String entitiesPath, String simRelsPath, boolean useTree){
+    public static void createSimRels(DedupConfig dedupConf, SparkSession spark, String entitiesPath, String simRelsPath, boolean useTree, boolean noMatch){
 
         JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
 
@@ -117,7 +117,7 @@ public class Deduper implements Serializable {
         JavaPairRDD<String, Block> blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf);
 
         // create relations by comparing only elements in the same group
-        JavaRDD<Relation> relations = Deduper.computeRelations(sc, blocks, dedupConf, useTree);
+        JavaRDD<Relation> relations = Deduper.computeRelations(sc, blocks, dedupConf, useTree, noMatch);
 
         // save the simrel in the workingdir
         spark

@@ -83,7 +83,7 @@ public class SparkCreateSimRels extends AbstractSparkJob {
         JavaPairRDD<String, Block> blocks = Deduper.createSortedBlocks(mapDocuments, dedupConfig);
 
         // create relations by comparing only elements in the same group
-        JavaRDD<Relation> relations = Deduper.computeRelations(sc, blocks, dedupConfig, useTree);
+        JavaRDD<Relation> relations = Deduper.computeRelations(sc, blocks, dedupConfig, useTree, false);
 
         // save the simrel in the workingdir
         spark

File diff suppressed because one or more lines are too long
@@ -51,6 +51,27 @@
     ],
     "decisionTree": {
+        "start": {
+            "fields": [
+                {
+                    "field": "pid",
+                    "comparator": "jsonListMatch",
+                    "weight": 1.0,
+                    "countIfUndefined": "false",
+                    "params": {
+                        "jpath_value": "$.value",
+                        "jpath_classid": "$.qualifier.classid",
+                        "mode": "count"
+                    }
+                }
+            ],
+            "threshold": 1.0,
+            "aggregation": "MAX",
+            "positive": "MATCH",
+            "negative": "NO_MATCH",
+            "undefined": "instanceTypeCheck",
+            "ignoreUndefined": "false"
+        },
         "instanceTypeCheck": {
             "fields": [
                 {
                     "field": "instance",
@@ -62,23 +83,13 @@
             ],
             "threshold": 0.5,
             "aggregation": "MAX",
-            "positive": "layer1",
+            "positive": "pidVSaltid",
             "negative": "NO_MATCH",
-            "undefined": "layer1",
+            "undefined": "pidVSaltid",
             "ignoreUndefined": "true"
         },
-        "layer1": {
+        "pidVSaltid": {
             "fields": [
-                {
-                    "field": "pid",
-                    "comparator": "jsonListMatch",
-                    "weight": 1.0,
-                    "countIfUndefined": "false",
-                    "params": {
-                        "jpath_value": "$.value",
-                        "jpath_classid": "$.qualifier.classid"
-                    }
-                },
                 {
                     "field": "pid",
                     "comparator": "jsonListMatch",
@@ -87,18 +98,19 @@
                     "params": {
                         "jpath_value": "$.value",
                         "jpath_classid": "$.qualifier.classid",
-                        "crossCompare": "alternateid"
+                        "crossCompare": "alternateid",
+                        "mode": "count"
                     }
                 }
             ],
-            "threshold": 0.5,
+            "threshold": 1.0,
             "aggregation": "MAX",
-            "positive": "layer2",
-            "negative": "layer3",
-            "undefined": "layer3",
+            "positive": "softCheck",
+            "negative": "earlyExits",
+            "undefined": "earlyExits",
             "ignoreUndefined": "true"
         },
-        "layer2": {
+        "softCheck": {
             "fields": [
                 {
                     "field": "title",
@@ -115,7 +127,7 @@
             "undefined": "NO_MATCH",
             "ignoreUndefined": "true"
         },
-        "layer3": {
+        "earlyExits": {
             "fields": [
                 {
                     "field": "title",
@@ -134,12 +146,12 @@
             ],
             "threshold": 1.0,
             "aggregation": "AND",
-            "positive": "layer4",
+            "positive": "strongCheck",
             "negative": "NO_MATCH",
-            "undefined": "layer4",
+            "undefined": "strongCheck",
             "ignoreUndefined": "false"
         },
-        "layer4": {
+        "strongCheck": {
             "fields": [
                 {
                     "field": "title",
@@ -151,10 +163,31 @@
             ],
             "threshold": 0.99,
             "aggregation": "AVG",
-            "positive": "MATCH",
+            "positive": "surnames",
             "negative": "NO_MATCH",
             "undefined": "NO_MATCH",
             "ignoreUndefined": "true"
         },
+        "surnames": {
+            "fields": [
+                {
+                    "field": "authors",
+                    "comparator": "authorsMatch",
+                    "weight": 1.0,
+                    "countIfUndefined": "false",
+                    "params": {
+                        "surname_th": 0.75,
+                        "fullname_th": 0.75,
+                        "mode": "surname"
+                    }
+                }
+            ],
+            "threshold": 0.6,
+            "aggregation": "MAX",
+            "positive": "MATCH",
+            "negative": "NO_MATCH",
+            "undefined": "MATCH",
+            "ignoreUndefined": "true"
+        }
     },
     "model": [

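For orientation, a minimal sketch of how a decision-tree configuration like the one above is typically driven: the tree is walked node by node (start, instanceTypeCheck, pidVSaltid, softCheck or earlyExits, strongCheck, surnames) until it lands on MATCH or NO_MATCH. The DedupConfig.load call, the publicationConfJson variable and the pivot/curr records below are assumptions for illustration, not part of this commit; only the TreeProcessor call mirrors the one used in BlockProcessorForTesting further down.

// Sketch under assumptions: DedupConfig.load(json) is assumed to be the config loader,
// and pivot/curr are two MapDocument records built elsewhere.
DedupConfig dedupConf = DedupConfig.load(publicationConfJson);              // publicationConfJson: the JSON shown above (hypothetical variable)
boolean simRel = new TreeProcessor(dedupConf).compare(pivot, curr);         // true when the walk ends in MATCH, e.g. via the new "surnames" node
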
@@ -16,6 +16,8 @@ import org.apache.commons.lang3.StringUtils;
 
 import java.io.IOException;
 import java.io.StringWriter;
+import java.io.UnsupportedEncodingException;
+import java.nio.charset.StandardCharsets;
 import java.text.Normalizer;
 import java.util.*;
 import java.util.function.Function;
@@ -160,6 +162,11 @@ public abstract class AbstractPaceFunctions {
         return Normalizer.normalize(s, Normalizer.Form.NFD);
     }
 
+    public String utf8(final String s) {
+        byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
+        return new String(bytes, StandardCharsets.UTF_8);
+    }
+
     public String unicodeNormalization(final String s) {
 
         Matcher m = hexUnicodePattern.matcher(s);

@@ -1,13 +1,13 @@
 package eu.dnetlib.pace.tree;
 
 import com.google.common.collect.Iterables;
-import com.wcohen.ss.JaroWinkler;
 import eu.dnetlib.pace.config.Config;
 import eu.dnetlib.pace.model.Field;
 import eu.dnetlib.pace.model.FieldList;
 import eu.dnetlib.pace.model.Person;
 import eu.dnetlib.pace.tree.support.AbstractComparator;
 import eu.dnetlib.pace.tree.support.ComparatorClass;
+import com.wcohen.ss.AbstractStringDistance;
 
 import java.util.Comparator;
 import java.util.List;
@@ -25,6 +25,7 @@ public class AuthorsMatch extends AbstractComparator {
     private double NAME_THRESHOLD;
     private double FULLNAME_THRESHOLD;
     private String MODE; //full or surname
+    private int common;
 
     public AuthorsMatch(Map<String, String> params){
         super(params, new com.wcohen.ss.JaroWinkler());
@@ -34,6 +35,11 @@ public class AuthorsMatch extends AbstractComparator {
         SURNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("surname_th", "0.95"));
         NAME_THRESHOLD = Double.parseDouble(params.getOrDefault("name_th", "0.95"));
         FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9"));
+        common = 0;
     }
 
+    protected AuthorsMatch(double w, AbstractStringDistance ssalgo) {
+        super(w, ssalgo);
+    }
+
     @Override
@@ -45,41 +51,85 @@ public class AuthorsMatch extends AbstractComparator {
         List<Person> aList = ((FieldList) a).stringList().stream().map(author -> new Person(author, false)).collect(Collectors.toList());
         List<Person> bList = ((FieldList) b).stringList().stream().map(author -> new Person(author, false)).collect(Collectors.toList());
 
-        int common = 0;
+        common = 0;
         //compare each element of List1 with each element of List2
         for (Person p1 : aList)
-            for (Person p2 : bList)
-                if (MODE.equals("full")) {
-                    if (personComparator(p1, p2))
-                        common += 1;
-                }
-                else {
-                    if (surnameComparator(p1, p2))
-                        common += 1;
-                }
-
-        return (double)common / (aList.size() + bList.size() - common);
-    }
-
-    public boolean personComparator(Person p1, Person p2) {
-
-        if(!p1.isAccurate() || !p2.isAccurate())
-            return ssalgo.score(p1.getOriginal(), p2.getOriginal()) > FULLNAME_THRESHOLD;
-
-        if(ssalgo.score(p1.getSurnameString(),p2.getSurnameString()) > SURNAME_THRESHOLD)
-            if(p1.getNameString().length()<=2 || p2.getNameString().length()<=2)
-                return firstLC(p1.getNameString()).equals(firstLC(p2.getNameString()));
-            else
-                return ssalgo.score(p1.getNameString(), p2.getNameString()) > NAME_THRESHOLD;
-        else
-            return false;
-    }
-
-    public boolean surnameComparator(Person p1, Person p2) {
-
-        if(!p1.isAccurate() || !p2.isAccurate())
-            return ssalgo.score(p1.getOriginal(), p2.getOriginal()) > FULLNAME_THRESHOLD;
-
-        return ssalgo.score(p1.getSurnameString(), p2.getSurnameString()) > SURNAME_THRESHOLD;
-    }
+
+            for (Person p2 : bList) {
+
+                //both persons are inaccurate
+                if (!p1.isAccurate() && !p2.isAccurate()) {
+                    //compare just normalized fullnames
+                    if (ssalgo.score(normalization(p1.getNormalisedFullname()), normalization(p2.getNormalisedFullname())) > FULLNAME_THRESHOLD) {
+                        common += 1;
+                        break;
+                    }
+                }
+
+                //one person is inaccurate
+                if (p1.isAccurate() ^ p2.isAccurate()) {
+                    //prepare data
+                    String name = p1.isAccurate()? normalization(p1.getNormalisedFirstName()) : normalization(p2.getNormalisedFirstName());
+                    String surname = p1.isAccurate()? normalization(p2.getNormalisedSurname()) : normalization(p2.getNormalisedSurname());
+
+                    String fullname = p1.isAccurate()? normalization(p2.getNormalisedFullname()) : normalization(p1.getNormalisedFullname());
+
+                    if (fullname.contains(surname)) {
+                        if (MODE.equals("full")) {
+                            if (fullname.contains(name)) {
+                                common += 1;
+                                break;
+                            }
+                        }
+                        else { //MODE equals "surname"
+                            common += 1;
+                            break;
+                        }
+                    }
+                }
+
+                //both persons are accurate
+                if (p1.isAccurate() && p2.isAccurate()) {
+
+                    if (compareSurname(p1, p2)) {
+                        if (MODE.equals("full")) {
+                            if(compareFirstname(p1, p2)) {
+                                common += 1;
+                                break;
+                            }
+                        }
+                        else { //MODE equals "surname"
+                            common += 1;
+                            break;
+                        }
+                    }
+
+                }
+
+            }
+
+        //normalization factor to compute the score
+        int normFactor = aList.size() == bList.size() ? aList.size() : (aList.size() + bList.size() - common);
+
+        return (double)common / normFactor;
+    }
+
+    public boolean compareSurname(Person p1, Person p2) {
+        return ssalgo.score(normalization(p1.getNormalisedSurname()), normalization(p2.getNormalisedSurname())) > SURNAME_THRESHOLD;
+    }
+
+    public boolean compareFirstname(Person p1, Person p2) {
+
+        if(p1.getNormalisedFirstName().length()<=2 || p2.getNormalisedFirstName().length()<=2) {
+            if (firstLC(p1.getNormalisedFirstName()).equals(firstLC(p2.getNormalisedFirstName())))
+                return true;
+        }
+
+        return ssalgo.score(normalization(p1.getNormalisedFirstName()), normalization(p2.getNormalisedFirstName())) > NAME_THRESHOLD;
+    }
+
+    public String normalization(String s) {
+        return normalize(utf8(cleanup(s)));
+    }
+
 }

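A hedged sketch of how the rewritten comparator can be exercised in "surname" mode; it assumes the ComparatorTest context shown below (its createFieldList helper and conf object), while the author lists and the expected value are made-up illustration only.

// Sketch, not part of the commit: author lists are hypothetical, params mirror the "surnames" node above.
Map<String, String> params = new HashMap<>();
params.put("surname_th", "0.75");
params.put("fullname_th", "0.75");
params.put("mode", "surname");
AuthorsMatch authorsMatch = new AuthorsMatch(params);

Field a = createFieldList(Arrays.asList("Manghi, Paolo", "Atzori, Claudio", "La Bruzzo, Sandro"), "authors");
Field b = createFieldList(Arrays.asList("Manghi, P.", "Bardi, Alessia"), "authors");
double result = authorsMatch.compare(a, b, conf);
// one shared surname and lists of different sizes: normFactor = 3 + 2 - 1 = 4, so the score should come out around 1/4 = 0.25
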
@@ -36,23 +36,23 @@ public class BlockProcessorForTesting {
         this.dedupConf = dedupConf;
     }
 
-    public void processSortedBlock(final String key, final List<MapDocument> documents, final Reporter context, boolean useTree) {
+    public void processSortedBlock(final String key, final List<MapDocument> documents, final Reporter context, boolean useTree, boolean noMatch) {
         if (documents.size() > 1) {
             // log.info("reducing key: '" + key + "' records: " + q.size());
-            process(prepare(documents), context, useTree);
+            process(prepare(documents), context, useTree, noMatch);
 
         } else {
             context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
         }
     }
 
-    public void process(final String key, final Iterable<MapDocument> documents, final Reporter context, boolean useTree) {
+    public void process(final String key, final Iterable<MapDocument> documents, final Reporter context, boolean useTree, boolean noMatch) {
 
         final Queue<MapDocument> q = prepare(documents);
 
         if (q.size() > 1) {
             // log.info("reducing key: '" + key + "' records: " + q.size());
-            process(simplifyQueue(q, key, context), context, useTree);
+            process(simplifyQueue(q, key, context), context, useTree, noMatch);
 
         } else {
             context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
@@ -123,7 +123,7 @@ public class BlockProcessorForTesting {
         }
     }
 
-    private void process(final Queue<MapDocument> queue, final Reporter context, boolean useTree) {
+    private void process(final Queue<MapDocument> queue, final Reporter context, boolean useTree, boolean noMatch) {
 
         while (!queue.isEmpty()) {
 
@@ -155,18 +155,18 @@ public class BlockProcessorForTesting {
 
                 if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
 
-                    if(!compareInstanceType(pivot, curr, dedupConf)){
-                        emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
+                    //draws no match relations (test purpose)
+                    if (noMatch) {
+                        emitOutput(!new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
                     }
                     else {
-                        emitOutput(false, idPivot, idCurr, context);
+                        //use the decision tree implementation or the "normal" implementation of the similarity score (valid only for publications)
+                        if(useTree)
+                            emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
+                        else
+                            emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context);
                     }
 
-//                    if(useTree)
-//                        emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
-//                    else
-//                        emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context);
-
                 }
             }
         }

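A hedged sketch of how the new noMatch switch can be driven from a test through the Deduper entry point added above; the SparkSession setup, the dedupConf variable and the paths are placeholders, not values taken from this commit.

// Sketch only: dedupConf is a DedupConfig loaded elsewhere; paths are hypothetical test locations.
SparkSession spark = SparkSession.builder().master("local[*]").appName("dedup-test").getOrCreate();
// regular simrels: pairs the decision tree marks as MATCH
Deduper.createSimRels(dedupConf, spark, "/tmp/entities", "/tmp/simrels", true, false);
// "no match" simrels: the testing block processor negates the tree decision, emitting the pairs it would NOT merge
Deduper.createSimRels(dedupConf, spark, "/tmp/entities", "/tmp/simrels_nomatch", true, true);
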
@@ -165,11 +165,6 @@ public class ComparatorTest extends AbstractPaceTest {
         result = jaroWinkler.distance("Victoria Dataverse", "Windsor Dataverse", conf);
         System.out.println("result = " + result);
 
-        final Levenstein levenstein = new Levenstein(params);
-
-        result = levenstein.distance("Victoria", "Windsor", conf);
-        System.out.println("result = " + result);
-
     }
 
     @Test
@@ -182,6 +177,14 @@ public class ComparatorTest extends AbstractPaceTest {
         System.out.println("result = " + result);
     }
 
+    @Test
+    public void levensteinTest() {
+        final Levenstein levenstein = new Levenstein(params);
+
+        double result = levenstein.distance("la bruzzo", "la bruzzo", conf);
+        System.out.println("result = " + result);
+    }
+
     @Test
     public void instanceTypeMatchTest() {
 
@@ -238,6 +241,11 @@ public class ComparatorTest extends AbstractPaceTest {
 
         assertEquals(1.0, result);
 
+        Field e = createFieldList(Arrays.asList("Manghi, Paolo", "Atzori, Claudio"), "authors");
+        result = authorsMatch.compare(a, e, conf);
+
+        assertEquals(0.25, result);
+
     }
 
     @Test

@@ -1,23 +1,35 @@
 package eu.dnetlib.pace.util;
 
+import eu.dnetlib.pace.model.Person;
 import jdk.nashorn.internal.ir.annotations.Ignore;
 import org.junit.jupiter.api.*;
 
 import java.util.HashMap;
 import java.util.Map;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 
 public class UtilTest {
 
-    Map<String, String> params;
+    static Map<String, String> params;
 
     @BeforeAll
-    public void setUp(){
-        params = new HashMap<String, String>();
+    public static void setUp(){
+        params = new HashMap<>();
     }
 
     @Test
     @Ignore
     public void paceResolverTest() {
         PaceResolver paceResolver = new PaceResolver();
         paceResolver.getComparator("keywordMatch", params);
     }
 
+    @Test
+    public void personTest() {
+        Person p = new Person("j. f. kennedy", false);
+
+        assertEquals("kennedy", p.getSurnameString());
+        assertEquals("j f", p.getNameString());
+    }
+
 }