implementation of author dedup configuration and lnfi clustering function

This commit is contained in:
Michele De Bonis 2023-01-31 11:53:10 +01:00
parent 00466512ea
commit 66472ce408
7 changed files with 291 additions and 18 deletions

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,179 @@
{
"wf": {
"threshold": "0.99",
"dedupRun": "001",
"entityType": "author",
"subEntityType": "author",
"subEntityValue": "author",
"orderField": "fullname",
"queueMaxSize": "200",
"groupMaxSize": "100",
"maxChildren": "100",
"slidingWindowSize": "50",
"rootBuilder": [
"result",
"resultProject_outcome_isProducedBy",
"resultResult_publicationDataset_isRelatedTo",
"resultResult_similarity_isAmongTopNSimilarDocuments",
"resultResult_similarity_hasAmongTopNSimilarDocuments",
"resultOrganization_affiliation_isAffiliatedWith",
"resultResult_part_hasPart",
"resultResult_part_isPartOf",
"resultResult_supplement_isSupplementTo",
"resultResult_supplement_isSupplementedBy",
"resultResult_version_isVersionOf"
],
"includeChildren": "true",
"maxIterations": 20,
"idPath": "$.id"
},
"pace": {
"clustering" : [
{ "name" : "lnfi", "fields" : [ "name" ], "params" : {} }
],
"decisionTree": {
"start": {
"fields": [
{
"field": "pub_id",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {}
}
],
"threshold":1,
"aggregation": "AVG",
"positive": "NO_MATCH",
"negative": "yearCheck",
"undefined": "yearCheck"
},
"yearCheck": {
"fields": [
{
"field": "year",
"comparator": "numbersComparator",
"weight": 1,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 50,
"aggregation": "MAX",
"positive": "NO_MATCH",
"negative": "surnames",
"undefined": "surnames",
"ignoreUndefined": "true"
},
"surnames": {
"fields": [
{
"field": "coauthors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"surname_th": 0.75,
"fullname_th": 0.75,
"size_th": 20,
"mode": "surname"
}
}
],
"threshold": 0.5,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "cityCheck",
"undefined": "cityCheck",
"ignoreUndefined": "true"
},
"cityCheck": {
"fields": [
{
"field": "org",
"comparator": "cityMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {
"windowSize": "4"
}
}
],
"threshold": 0.1,
"aggregation": "AVG",
"positive": "keywordCheck",
"negative": "NO_MATCH",
"undefined": "keywordCheck",
"ignoreUndefined": "true"
},
"keywordCheck": {
"fields": [
{
"field": "org",
"comparator": "keywordMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {
"windowSize": "4"
}
}
],
"threshold": 0.5,
"aggregation": "AVG",
"positive": "orgCheck",
"negative": "NO_MATCH",
"undefined": "orgCheck",
"ignoreUndefined": "true"
},
"orgCheck": {
"fields": [
{
"field": "org",
"comparator": "jaroWinklerNormalizedName",
"weight": 1,
"countIfUndefined": "true",
"params": {
"windowSize": "4"
}
}
],
"threshold": 0.7,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "MATCH",
"ignoreUndefined": "true"
}
},
"model": [
{
"name": "name",
"type": "String",
"path": "$.name"
},
{
"name": "coauthors",
"type": "List",
"path": "$.coauthors[*].name",
"size": 200
},
{
"name": "year",
"type": "String",
"path": "$.year"
},
{
"name": "pub_id",
"type": "String",
"path": "$.pub_id"
},
{
"name": "org",
"type": "String",
"path": "$.org"
}
],
"blacklists": {},
"synonyms": {}
}
}

View File

@ -29,26 +29,24 @@
},
"pace": {
"clustering" : [
{ "name" : "personClustering", "fields" : [ "fullname" ], "params" : {} },
{ "name" : "personHash", "fields" : [ "fullname" ], "params" : {} }
{ "name" : "lnfi", "fields" : [ "name" ], "params" : {} }
],
"decisionTree": {
"start": {
"fields": [
{
"field": "year",
"comparator": "numbersComparator",
"field": "pub_id",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 50,
"aggregation": "MAX",
"threshold":1,
"aggregation": "AVG",
"positive": "NO_MATCH",
"negative": "surnames",
"undefined": "surnames",
"ignoreUndefined": "true"
"negative": "yearCheck",
"undefined": "yearCheck"
},
"surnames": {
"fields": [
@ -65,7 +63,7 @@
}
}
],
"threshold": 0.6,
"threshold": 0.5,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
@ -75,7 +73,7 @@
},
"model": [
{
"name": "fullname",
"name": "name",
"type": "String",
"path": "$.name"
},
@ -88,12 +86,17 @@
{
"name": "year",
"type": "String",
"path": "$.publication.year"
"path": "$.year"
},
{
"name": "title",
"name": "pub_id",
"type": "String",
"path": "$.publication.title"
"path": "$.pub_id"
},
{
"name": "org",
"type": "String",
"path": "$.org"
}
],
"blacklists": {},

View File

@ -0,0 +1,77 @@
package eu.dnetlib.pace.clustering;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.Person;
import org.apache.commons.lang3.StringUtils;
import java.util.*;
import java.util.stream.Collectors;
/**
 * Clustering function registered as {@code "lnfi"} ("last name, first initial").
 *
 * <p>For every non-empty input field it produces one or two lower-case keys made of the
 * initial of one name part followed by another name part (e.g. first initial + surname).
 * Records that share a key end up in the same cluster.
 *
 * <p>Supported parameter: {@code aggressive} - forwarded to the {@link Person} constructor;
 * defaults to {@code true} when absent.
 */
@ClusteringClass("lnfi")
public class LastNameFirstInitial extends AbstractClusteringFunction{

    /** Value used for the {@code aggressive} flag when the parameter is not configured. */
    private static final boolean DEFAULT_AGGRESSIVE = true;

    /**
     * @param params clustering parameters, handed to the superclass unchanged
     */
    public LastNameFirstInitial(final Map<String, Integer> params) {
        super(params);
    }

    /**
     * Computes the set of clustering keys for the given fields.
     *
     * @param conf   the dedup configuration, passed through to {@link #doApply(Config, String)}
     * @param fields the fields to derive keys from; empty fields are ignored
     * @return the distinct, non-blank keys that survive the blacklist filter
     */
    @Override
    public Collection<String> apply(Config conf, List<Field> fields) {
        return fields.stream().filter(f -> !f.isEmpty())
                .map(Field::stringValue)
                .map(this::normalize)
                .map(s -> doApply(conf, s))
                .map(c -> filterBlacklisted(c, ngramBlacklist))
                .flatMap(Collection::stream)
                .filter(StringUtils::isNotBlank)
                .collect(Collectors.toCollection(HashSet::new));
    }

    /**
     * Cleans the raw name before it is parsed. Unlike what one might expect from a
     * "normalize" step, the case of the input is NOT changed here; lower-casing is done
     * when the keys are built.
     *
     * @param s the raw field value
     * @return the cleaned, trimmed string
     */
    @Override
    protected String normalize(final String s) {
        return fixAliases(transliterate(nfd(unicodeNormalization(s))))
                // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
                .replaceAll("[^ \\w]+", "")
                .replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
                .replaceAll("(\\p{Punct})+", " ")
                .replaceAll("(\\d)+", " ")
                .replaceAll("(\\n)+", " ")
                .trim();
    }

    /**
     * Builds the clustering keys for a single (already normalized) name.
     *
     * <ul>
     *   <li>Accurate person (name and surname identified): one key,
     *       first-name initial + surname.</li>
     *   <li>Otherwise, single token: the token itself.</li>
     *   <li>Otherwise, two or more tokens: two keys, so that both possible orderings
     *       ("first last" and "last first") are covered: initial of the first token +
     *       last token, and initial of the last token + first token.</li>
     * </ul>
     *
     * @param conf the dedup configuration (not used by this function)
     * @param s    the normalized name
     * @return the keys, all lower-case; empty when the name contains no token
     */
    @Override
    protected Collection<String> doApply(final Config conf, final String s) {
        final List<String> res = Lists.newArrayList();

        final Person p = new Person(s, isAggressive());

        if (p.isAccurate()) {
            // Locale.ROOT keeps the keys stable regardless of the JVM default locale
            final String lastName = p.getNormalisedSurname().toLowerCase(Locale.ROOT);
            final String firstName = p.getNormalisedFirstName().toLowerCase(Locale.ROOT);
            // guard against an empty first name instead of failing in substring(0, 1)
            final String firstInitial = firstName.isEmpty() ? "" : firstName.substring(0, 1);
            res.add(firstInitial.concat(lastName));
            return res;
        }

        // is not accurate, meaning it has no defined name and surname
        // split on any whitespace run and drop empty tokens, so that leading or repeated
        // blanks cannot produce a zero-length token
        final List<String> tokens = Arrays.stream(p.getNormalisedFullname().trim().split("\\s+"))
                .filter(t -> !t.isEmpty())
                .collect(Collectors.toList());

        if (tokens.isEmpty()) {
            return res;
        }
        if (tokens.size() == 1) {
            res.add(tokens.get(0).toLowerCase(Locale.ROOT));
            return res;
        }

        final String first = tokens.get(0);
        final String last = tokens.get(tokens.size() - 1);
        res.add(first.substring(0, 1).concat(last).toLowerCase(Locale.ROOT));
        res.add(last.substring(0, 1).concat(first).toLowerCase(Locale.ROOT));
        return res;
    }

    /**
     * Reads the {@code aggressive} parameter.
     *
     * <p>The parameter map is typed with {@code Integer} values, so a plain cast to
     * {@code Boolean} would fail at runtime; a numeric value is interpreted as
     * "non-zero means true". A {@code Boolean} value is accepted as-is, anything else
     * (including a missing key) falls back to {@link #DEFAULT_AGGRESSIVE}.
     */
    private boolean isAggressive() {
        final Object raw = getParams().get("aggressive");
        if (raw instanceof Boolean) {
            return (Boolean) raw;
        }
        if (raw instanceof Number) {
            return ((Number) raw).intValue() != 0;
        }
        return DEFAULT_AGGRESSIVE;
    }
}

View File

@ -43,7 +43,7 @@ public class Person {
// s = s.replaceAll("[\\W&&[^,-]]", "");
}
if (s.contains(",")) {
if (s.contains(",")) { //if the name contains a comma it is easy derivable the name and the surname
final String[] arr = s.split(",");
if (arr.length == 1) {
fullname = splitTerms(arr[0]);

View File

@ -237,4 +237,13 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
}
@Test
public void testLastNameFirstInitial(){
    // Smoke test: prints the input name and the keys produced by the "lnfi"
    // clustering function so they can be inspected manually.
    final String s = "LI Yonghong";
    System.out.println("s = " + s);

    final ClusteringFunction cf = new LastNameFirstInitial(params);
    final Object keys = cf.apply(conf, Lists.newArrayList(title(s)));
    System.out.println(keys);
}
}

View File

@ -30,6 +30,11 @@ public class UtilTest {
assertEquals("kennedy", p.getSurnameString());
assertEquals("j f", p.getNameString());
p = new Person("Guan-Hua Du", false);
System.out.println("surname = " + p.getSurnameString());
System.out.println("name = " + p.getNameString());
}
}