From 2f1ba56f616c84017e4f74fa2b089c0a1782fca1 Mon Sep 17 00:00:00 2001 From: miconis Date: Thu, 13 Jan 2022 11:58:28 +0100 Subject: [PATCH] bug fix in the authormatch comparator, implementation of tests --- .../pace/common/AbstractPaceFunctions.java | 7 ++ .../eu/dnetlib/pace/tree/AuthorsMatch.java | 102 +++++++++++++----- .../pace/util/BlockProcessorForTesting.java | 26 ++--- .../pace/comparators/ComparatorTest.java | 18 +++- .../java/eu/dnetlib/pace/util/UtilTest.java | 18 +++- 5 files changed, 124 insertions(+), 47 deletions(-) diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java index 2f0fc4f45..858fe9801 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java @@ -16,6 +16,8 @@ import org.apache.commons.lang3.StringUtils; import java.io.IOException; import java.io.StringWriter; +import java.io.UnsupportedEncodingException; +import java.nio.charset.StandardCharsets; import java.text.Normalizer; import java.util.*; import java.util.function.Function; @@ -160,6 +162,11 @@ public abstract class AbstractPaceFunctions { return Normalizer.normalize(s, Normalizer.Form.NFD); } + public String utf8(final String s) { + byte[] bytes = s.getBytes(StandardCharsets.UTF_8); + return new String(bytes, StandardCharsets.UTF_8); + } + public String unicodeNormalization(final String s) { Matcher m = hexUnicodePattern.matcher(s); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java index ff9d49794..b5a56f6a4 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java @@ -1,13 +1,13 @@ package eu.dnetlib.pace.tree; import com.google.common.collect.Iterables; 
-import com.wcohen.ss.JaroWinkler; import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldList; import eu.dnetlib.pace.model.Person; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; +import com.wcohen.ss.AbstractStringDistance; import java.util.Comparator; import java.util.List; @@ -25,6 +25,7 @@ public class AuthorsMatch extends AbstractComparator { private double NAME_THRESHOLD; private double FULLNAME_THRESHOLD; private String MODE; //full or surname + private int common; public AuthorsMatch(Map params){ super(params, new com.wcohen.ss.JaroWinkler()); @@ -34,6 +35,11 @@ public class AuthorsMatch extends AbstractComparator { SURNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("surname_th", "0.95")); NAME_THRESHOLD = Double.parseDouble(params.getOrDefault("name_th", "0.95")); FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9")); + common = 0; + } + + protected AuthorsMatch(double w, AbstractStringDistance ssalgo) { + super(w, ssalgo); } @Override @@ -45,41 +51,85 @@ public class AuthorsMatch extends AbstractComparator { List aList = ((FieldList) a).stringList().stream().map(author -> new Person(author, false)).collect(Collectors.toList()); List bList = ((FieldList) b).stringList().stream().map(author -> new Person(author, false)).collect(Collectors.toList()); - int common = 0; + common = 0; + //compare each element of List1 with each element of List2 for (Person p1 : aList) - for (Person p2 : bList) - if(MODE.equals("full")) { - if (personComparator(p1, p2)) - common += 1; - } - else { - if (surnameComparator(p1, p2)) + + for (Person p2 : bList) { + + //both persons are inaccurate + if (!p1.isAccurate() && !p2.isAccurate()) { + //compare just normalized fullnames + if (ssalgo.score(normalization(p1.getNormalisedFullname()), normalization(p2.getNormalisedFullname())) > FULLNAME_THRESHOLD) { common += 1; + break; + } } 
- return (double)common / (aList.size() + bList.size() - common); + //one person is inaccurate + if (p1.isAccurate() ^ p2.isAccurate()) { + //prepare data: name and surname are taken from the accurate person, fullname from the inaccurate one + String name = p1.isAccurate()? normalization(p1.getNormalisedFirstName()) : normalization(p2.getNormalisedFirstName()); + String surname = p1.isAccurate()? normalization(p1.getNormalisedSurname()) : normalization(p2.getNormalisedSurname()); + + String fullname = p1.isAccurate()? normalization(p2.getNormalisedFullname()) : normalization(p1.getNormalisedFullname()); + + if (fullname.contains(surname)) { + if (MODE.equals("full")) { + if (fullname.contains(name)) { + common += 1; + break; + } + } + else { //MODE equals "surname" + common += 1; + break; + } + } + } + + //both persons are accurate + if (p1.isAccurate() && p2.isAccurate()) { + + if (compareSurname(p1, p2)) { + if (MODE.equals("full")) { + if(compareFirstname(p1, p2)) { + common += 1; + break; + } + } + else { //MODE equals "surname" + common += 1; + break; + } + } + + } + + } + + //normalization factor to compute the score + int normFactor = aList.size() == bList.size() ? 
aList.size() : (aList.size() + bList.size() - common); + + return (double)common / normFactor; } - public boolean personComparator(Person p1, Person p2) { - - if(!p1.isAccurate() || !p2.isAccurate()) - return ssalgo.score(p1.getOriginal(), p2.getOriginal()) > FULLNAME_THRESHOLD; - - if(ssalgo.score(p1.getSurnameString(),p2.getSurnameString()) > SURNAME_THRESHOLD) - if(p1.getNameString().length()<=2 || p2.getNameString().length()<=2) - return firstLC(p1.getNameString()).equals(firstLC(p2.getNameString())); - else - return ssalgo.score(p1.getNameString(), p2.getNameString()) > NAME_THRESHOLD; - else - return false; + public boolean compareSurname(Person p1, Person p2) { + return ssalgo.score(normalization(p1.getNormalisedSurname()), normalization(p2.getNormalisedSurname())) > SURNAME_THRESHOLD; } - public boolean surnameComparator(Person p1, Person p2) { + public boolean compareFirstname(Person p1, Person p2) { - if(!p1.isAccurate() || !p2.isAccurate()) - return ssalgo.score(p1.getOriginal(), p2.getOriginal()) > FULLNAME_THRESHOLD; + if(p1.getNormalisedFirstName().length()<=2 || p2.getNormalisedFirstName().length()<=2) { + if (firstLC(p1.getNormalisedFirstName()).equals(firstLC(p2.getNormalisedFirstName()))) + return true; + } - return ssalgo.score(p1.getSurnameString(), p2.getSurnameString()) > SURNAME_THRESHOLD; + return ssalgo.score(normalization(p1.getNormalisedFirstName()), normalization(p2.getNormalisedFirstName())) > NAME_THRESHOLD; + } + + public String normalization(String s) { + return normalize(utf8(cleanup(s))); } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java index d5e785af6..a6bef3f3c 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java @@ -36,23 +36,23 @@ public class BlockProcessorForTesting { 
this.dedupConf = dedupConf; } - public void processSortedBlock(final String key, final List documents, final Reporter context, boolean useTree) { + public void processSortedBlock(final String key, final List documents, final Reporter context, boolean useTree, boolean noMatch) { if (documents.size() > 1) { // log.info("reducing key: '" + key + "' records: " + q.size()); - process(prepare(documents), context, useTree); + process(prepare(documents), context, useTree, noMatch); } else { context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1); } } - public void process(final String key, final Iterable documents, final Reporter context, boolean useTree) { + public void process(final String key, final Iterable documents, final Reporter context, boolean useTree, boolean noMatch) { final Queue q = prepare(documents); if (q.size() > 1) { // log.info("reducing key: '" + key + "' records: " + q.size()); - process(simplifyQueue(q, key, context), context, useTree); + process(simplifyQueue(q, key, context), context, useTree, noMatch); } else { context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1); @@ -123,7 +123,7 @@ public class BlockProcessorForTesting { } } - private void process(final Queue queue, final Reporter context, boolean useTree) { + private void process(final Queue queue, final Reporter context, boolean useTree, boolean noMatch) { while (!queue.isEmpty()) { @@ -155,18 +155,18 @@ public class BlockProcessorForTesting { if (!idCurr.equals(idPivot) && (fieldCurr != null)) { - if(!compareInstanceType(pivot, curr, dedupConf)){ - emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context); + //draws no match relations (test purpose) + if (noMatch) { + emitOutput(!new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context); } else { - emitOutput(false, idPivot, idCurr, context); + //use the decision tree implementation or the "normal" implementation of the 
similarity score (valid only for publications) + if(useTree) + emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context); + else + emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context); } -// if(useTree) -// emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context); -// else -// emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context); - } } } diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java index 70a5b16b5..f4eee93e0 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java @@ -165,11 +165,6 @@ public class ComparatorTest extends AbstractPaceTest { result = jaroWinkler.distance("Victoria Dataverse", "Windsor Dataverse", conf); System.out.println("result = " + result); - final Levenstein levenstein = new Levenstein(params); - - result = levenstein.distance("Victoria", "Windsor", conf); - System.out.println("result = " + result); - } @Test @@ -182,6 +177,14 @@ public class ComparatorTest extends AbstractPaceTest { System.out.println("result = " + result); } + @Test + public void levensteinTest() { + final Levenstein levenstein = new Levenstein(params); + + double result = levenstein.distance("la bruzzo", "la bruzzo", conf); + System.out.println("result = " + result); + } + @Test public void instanceTypeMatchTest() { @@ -238,6 +241,11 @@ public class ComparatorTest extends AbstractPaceTest { assertEquals(1.0, result); + Field e = createFieldList(Arrays.asList("Manghi, Paolo", "Atzori, Claudio"), "authors"); + result = authorsMatch.compare(a, e, conf); + + assertEquals(0.25, result); + } @Test diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java 
b/dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java index d8e0767d5..601831e67 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java @@ -1,23 +1,35 @@ package eu.dnetlib.pace.util; +import eu.dnetlib.pace.model.Person; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.*; import java.util.HashMap; import java.util.Map; +import static org.junit.jupiter.api.Assertions.assertEquals; public class UtilTest { - Map params; + static Map params; @BeforeAll - public void setUp(){ - params = new HashMap(); + public static void setUp(){ + params = new HashMap<>(); } @Test + @Disabled public void paceResolverTest() { PaceResolver paceResolver = new PaceResolver(); paceResolver.getComparator("keywordMatch", params); } + @Test + public void personTest() { + Person p = new Person("j. f. kennedy", false); + + assertEquals("kennedy", p.getSurnameString()); + assertEquals("j f", p.getNameString()); + } + }