Master branch updates from beta September 2023 #337

Manually merged
claudio.atzori merged 1271 commits from beta into master 2023-09-06 11:31:09 +02:00
5 changed files with 124 additions and 47 deletions
Showing only changes of commit 2f1ba56f61 - Show all commits

View File

@ -16,6 +16,8 @@ import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.*;
import java.util.function.Function;
@ -160,6 +162,11 @@ public abstract class AbstractPaceFunctions {
return Normalizer.normalize(s, Normalizer.Form.NFD);
}
/**
 * Re-encodes the given string by converting it to UTF-8 bytes and decoding those bytes back.
 * Well-formed strings come back unchanged; per {@code String.getBytes(Charset)}, input that
 * cannot be encoded (e.g. an unpaired surrogate) is substituted with the charset's replacement.
 *
 * @param s the string to round-trip through UTF-8; must not be null
 * @return the string decoded from the UTF-8 encoding of {@code s}
 */
public String utf8(final String s) {
    return new String(s.getBytes(StandardCharsets.UTF_8), StandardCharsets.UTF_8);
}
public String unicodeNormalization(final String s) {
Matcher m = hexUnicodePattern.matcher(s);

View File

@ -1,13 +1,13 @@
package eu.dnetlib.pace.tree;
import com.google.common.collect.Iterables;
import com.wcohen.ss.JaroWinkler;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.Person;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import com.wcohen.ss.AbstractStringDistance;
import java.util.Comparator;
import java.util.List;
@ -25,6 +25,7 @@ public class AuthorsMatch extends AbstractComparator {
private double NAME_THRESHOLD;
private double FULLNAME_THRESHOLD;
private String MODE; //full or surname
private int common;
public AuthorsMatch(Map<String, String> params){
super(params, new com.wcohen.ss.JaroWinkler());
@ -34,6 +35,11 @@ public class AuthorsMatch extends AbstractComparator {
SURNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("surname_th", "0.95"));
NAME_THRESHOLD = Double.parseDouble(params.getOrDefault("name_th", "0.95"));
FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9"));
common = 0;
}
/**
 * Builds the comparator from an explicit weight and string-distance algorithm by delegating
 * to the superclass constructor.
 *
 * NOTE(review): this constructor only calls {@code super(w, ssalgo)}; the threshold fields that
 * the Map-based constructor parses from its params are not assigned here — confirm that
 * instances created this way are configured before compare is used.
 *
 * @param w      weight passed through to the superclass
 * @param ssalgo string-distance algorithm passed through to the superclass
 */
protected AuthorsMatch(double w, AbstractStringDistance ssalgo) {
super(w, ssalgo);
}
@Override
@ -45,41 +51,85 @@ public class AuthorsMatch extends AbstractComparator {
List<Person> aList = ((FieldList) a).stringList().stream().map(author -> new Person(author, false)).collect(Collectors.toList());
List<Person> bList = ((FieldList) b).stringList().stream().map(author -> new Person(author, false)).collect(Collectors.toList());
int common = 0;
common = 0;
//compare each element of List1 with each element of List2
for (Person p1 : aList)
for (Person p2 : bList)
if(MODE.equals("full")) {
if (personComparator(p1, p2))
common += 1;
}
else {
if (surnameComparator(p1, p2))
for (Person p2 : bList) {
//both persons are inaccurate
if (!p1.isAccurate() && !p2.isAccurate()) {
//compare just normalized fullnames
if (ssalgo.score(normalization(p1.getNormalisedFullname()), normalization(p2.getNormalisedFullname())) > FULLNAME_THRESHOLD) {
common += 1;
break;
}
}
return (double)common / (aList.size() + bList.size() - common);
//exactly one person is accurate
if (p1.isAccurate() ^ p2.isAccurate()) {
    //the accurate person supplies first name and surname, the inaccurate one only its full name
    final Person accurate = p1.isAccurate() ? p1 : p2;
    final Person inaccurate = p1.isAccurate() ? p2 : p1;

    final String name = normalization(accurate.getNormalisedFirstName());
    //fix: the surname used to be read from p2 in both cases, so with p1 accurate the
    //inaccurate person's own surname was checked against its own full name
    final String surname = normalization(accurate.getNormalisedSurname());
    final String fullname = normalization(inaccurate.getNormalisedFullname());

    //the surname must always appear in the full name; in "full" mode the first name must too
    final boolean surnameFound = fullname.contains(surname);
    final boolean nameFoundIfRequired = !MODE.equals("full") || fullname.contains(name);
    if (surnameFound && nameFoundIfRequired) {
        common += 1;
        break;
    }
}
//both persons are accurate
if (p1.isAccurate() && p2.isAccurate()) {
if (compareSurname(p1, p2)) {
if (MODE.equals("full")) {
if(compareFirstname(p1, p2)) {
common += 1;
break;
}
}
else { //MODE equals "surname"
common += 1;
break;
}
}
}
}
//normalization factor to compute the score
int normFactor = aList.size() == bList.size() ? aList.size() : (aList.size() + bList.size() - common);
return (double)common / normFactor;
}
public boolean personComparator(Person p1, Person p2) {
if(!p1.isAccurate() || !p2.isAccurate())
return ssalgo.score(p1.getOriginal(), p2.getOriginal()) > FULLNAME_THRESHOLD;
if(ssalgo.score(p1.getSurnameString(),p2.getSurnameString()) > SURNAME_THRESHOLD)
if(p1.getNameString().length()<=2 || p2.getNameString().length()<=2)
return firstLC(p1.getNameString()).equals(firstLC(p2.getNameString()));
else
return ssalgo.score(p1.getNameString(), p2.getNameString()) > NAME_THRESHOLD;
else
return false;
/**
 * Checks whether the surnames of two persons are similar enough to be considered the same.
 * Both surnames go through {@code normalization} before being scored with {@code ssalgo}.
 *
 * @param p1 first person
 * @param p2 second person
 * @return true when the score of the two normalized surnames is strictly above SURNAME_THRESHOLD
 */
public boolean compareSurname(Person p1, Person p2) {
    final String firstSurname = normalization(p1.getNormalisedSurname());
    final String secondSurname = normalization(p2.getNormalisedSurname());
    final double similarity = ssalgo.score(firstSurname, secondSurname);
    return similarity > SURNAME_THRESHOLD;
}
public boolean surnameComparator(Person p1, Person p2) {
public boolean compareFirstname(Person p1, Person p2) {
if(!p1.isAccurate() || !p2.isAccurate())
return ssalgo.score(p1.getOriginal(), p2.getOriginal()) > FULLNAME_THRESHOLD;
if(p1.getNormalisedFirstName().length()<=2 || p2.getNormalisedFirstName().length()<=2) {
if (firstLC(p1.getNormalisedFirstName()).equals(firstLC(p2.getNormalisedFirstName())))
return true;
}
return ssalgo.score(p1.getSurnameString(), p2.getSurnameString()) > SURNAME_THRESHOLD;
return ssalgo.score(normalization(p1.getNormalisedFirstName()), normalization(p2.getNormalisedFirstName())) > NAME_THRESHOLD;
}
/**
 * Prepares a name string for comparison by applying, in this order, {@code cleanup},
 * {@code utf8} and {@code normalize}.
 *
 * @param s the raw string
 * @return the result of the three-step pipeline
 */
public String normalization(String s) {
    final String cleaned = cleanup(s);
    final String reencoded = utf8(cleaned);
    return normalize(reencoded);
}
}

View File

@ -36,23 +36,23 @@ public class BlockProcessorForTesting {
this.dedupConf = dedupConf;
}
public void processSortedBlock(final String key, final List<MapDocument> documents, final Reporter context, boolean useTree) {
public void processSortedBlock(final String key, final List<MapDocument> documents, final Reporter context, boolean useTree, boolean noMatch) {
if (documents.size() > 1) {
// log.info("reducing key: '" + key + "' records: " + q.size());
process(prepare(documents), context, useTree);
process(prepare(documents), context, useTree, noMatch);
} else {
context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
}
}
public void process(final String key, final Iterable<MapDocument> documents, final Reporter context, boolean useTree) {
public void process(final String key, final Iterable<MapDocument> documents, final Reporter context, boolean useTree, boolean noMatch) {
final Queue<MapDocument> q = prepare(documents);
if (q.size() > 1) {
// log.info("reducing key: '" + key + "' records: " + q.size());
process(simplifyQueue(q, key, context), context, useTree);
process(simplifyQueue(q, key, context), context, useTree, noMatch);
} else {
context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
@ -123,7 +123,7 @@ public class BlockProcessorForTesting {
}
}
private void process(final Queue<MapDocument> queue, final Reporter context, boolean useTree) {
private void process(final Queue<MapDocument> queue, final Reporter context, boolean useTree, boolean noMatch) {
while (!queue.isEmpty()) {
@ -155,18 +155,18 @@ public class BlockProcessorForTesting {
if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
if(!compareInstanceType(pivot, curr, dedupConf)){
emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
//draws no match relations (test purpose)
if (noMatch) {
emitOutput(!new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
}
else {
emitOutput(false, idPivot, idCurr, context);
//use the decision tree implementation or the "normal" implementation of the similarity score (valid only for publications)
if(useTree)
emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
else
emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context);
}
// if(useTree)
// emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
// else
// emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context);
}
}
}

View File

@ -165,11 +165,6 @@ public class ComparatorTest extends AbstractPaceTest {
result = jaroWinkler.distance("Victoria Dataverse", "Windsor Dataverse", conf);
System.out.println("result = " + result);
final Levenstein levenstein = new Levenstein(params);
result = levenstein.distance("Victoria", "Windsor", conf);
System.out.println("result = " + result);
}
@Test
@ -182,6 +177,14 @@ public class ComparatorTest extends AbstractPaceTest {
System.out.println("result = " + result);
}
// Runs the Levenstein comparator on two identical strings and prints the outcome.
// NOTE(review): the result is only printed, never asserted — consider pinning an expected value.
@Test
public void levensteinTest() {
    final String text = "la bruzzo";
    final Levenstein comparator = new Levenstein(params);

    final double result = comparator.distance(text, text, conf);

    System.out.println("result = " + result);
}
@Test
public void instanceTypeMatchTest() {
@ -238,6 +241,11 @@ public class ComparatorTest extends AbstractPaceTest {
assertEquals(1.0, result);
Field e = createFieldList(Arrays.asList("Manghi, Paolo", "Atzori, Claudio"), "authors");
result = authorsMatch.compare(a, e, conf);
assertEquals(0.25, result);
}
@Test

View File

@ -1,23 +1,35 @@
package eu.dnetlib.pace.util;
import eu.dnetlib.pace.model.Person;
import jdk.nashorn.internal.ir.annotations.Ignore;
import org.junit.jupiter.api.*;
import java.util.HashMap;
import java.util.Map;
import static org.junit.jupiter.api.Assertions.assertEquals;
public class UtilTest {
Map<String, String> params;
static Map<String, String> params;
@BeforeAll
public void setUp(){
params = new HashMap<String, String>();
public static void setUp(){
params = new HashMap<>();
}
// Resolves the "keywordMatch" comparator through PaceResolver.
// Skipped via JUnit Jupiter's @Disabled: the previously used @Ignore resolved to
// jdk.nashorn.internal.ir.annotations.Ignore, which Jupiter does not recognise, so the
// test was still being executed.
@Test
@Disabled
public void paceResolverTest() {
    PaceResolver paceResolver = new PaceResolver();
    paceResolver.getComparator("keywordMatch", params);
}
// Parses an abbreviated author name (not flagged as accurate input) and checks
// the surname and name strings extracted from it.
@Test
public void personTest() {
    final Person kennedy = new Person("j. f. kennedy", false);

    final String surname = kennedy.getSurnameString();
    assertEquals("kennedy", surname);

    final String givenNames = kennedy.getNameString();
    assertEquals("j f", givenNames);
}
}