implementation of author dedup configuration and lnfi clustering function
This commit is contained in:
parent
00466512ea
commit
66472ce408
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,179 @@
|
|||
{
|
||||
"wf": {
|
||||
"threshold": "0.99",
|
||||
"dedupRun": "001",
|
||||
"entityType": "author",
|
||||
"subEntityType": "author",
|
||||
"subEntityValue": "author",
|
||||
"orderField": "fullname",
|
||||
"queueMaxSize": "200",
|
||||
"groupMaxSize": "100",
|
||||
"maxChildren": "100",
|
||||
"slidingWindowSize": "50",
|
||||
"rootBuilder": [
|
||||
"result",
|
||||
"resultProject_outcome_isProducedBy",
|
||||
"resultResult_publicationDataset_isRelatedTo",
|
||||
"resultResult_similarity_isAmongTopNSimilarDocuments",
|
||||
"resultResult_similarity_hasAmongTopNSimilarDocuments",
|
||||
"resultOrganization_affiliation_isAffiliatedWith",
|
||||
"resultResult_part_hasPart",
|
||||
"resultResult_part_isPartOf",
|
||||
"resultResult_supplement_isSupplementTo",
|
||||
"resultResult_supplement_isSupplementedBy",
|
||||
"resultResult_version_isVersionOf"
|
||||
],
|
||||
"includeChildren": "true",
|
||||
"maxIterations": 20,
|
||||
"idPath": "$.id"
|
||||
},
|
||||
"pace": {
|
||||
"clustering" : [
|
||||
{ "name" : "lnfi", "fields" : [ "name" ], "params" : {} }
|
||||
],
|
||||
"decisionTree": {
|
||||
"start": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "pub_id",
|
||||
"comparator": "exactMatch",
|
||||
"weight": 1,
|
||||
"countIfUndefined": "false",
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"threshold":1,
|
||||
"aggregation": "AVG",
|
||||
"positive": "NO_MATCH",
|
||||
"negative": "yearCheck",
|
||||
"undefined": "yearCheck"
|
||||
},
|
||||
"yearCheck": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "year",
|
||||
"comparator": "numbersComparator",
|
||||
"weight": 1,
|
||||
"countIfUndefined": "false",
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"threshold": 50,
|
||||
"aggregation": "MAX",
|
||||
"positive": "NO_MATCH",
|
||||
"negative": "surnames",
|
||||
"undefined": "surnames",
|
||||
"ignoreUndefined": "true"
|
||||
},
|
||||
"surnames": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "coauthors",
|
||||
"comparator": "authorsMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "false",
|
||||
"params": {
|
||||
"surname_th": 0.75,
|
||||
"fullname_th": 0.75,
|
||||
"size_th": 20,
|
||||
"mode": "surname"
|
||||
}
|
||||
}
|
||||
],
|
||||
"threshold": 0.5,
|
||||
"aggregation": "MAX",
|
||||
"positive": "MATCH",
|
||||
"negative": "cityCheck",
|
||||
"undefined": "cityCheck",
|
||||
"ignoreUndefined": "true"
|
||||
},
|
||||
"cityCheck": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "org",
|
||||
"comparator": "cityMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "true",
|
||||
"params": {
|
||||
"windowSize": "4"
|
||||
}
|
||||
}
|
||||
],
|
||||
"threshold": 0.1,
|
||||
"aggregation": "AVG",
|
||||
"positive": "keywordCheck",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "keywordCheck",
|
||||
"ignoreUndefined": "true"
|
||||
},
|
||||
"keywordCheck": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "org",
|
||||
"comparator": "keywordMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "true",
|
||||
"params": {
|
||||
"windowSize": "4"
|
||||
}
|
||||
}
|
||||
],
|
||||
"threshold": 0.5,
|
||||
"aggregation": "AVG",
|
||||
"positive": "orgCheck",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "orgCheck",
|
||||
"ignoreUndefined": "true"
|
||||
},
|
||||
"orgCheck": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "org",
|
||||
"comparator": "jaroWinklerNormalizedName",
|
||||
"weight": 1,
|
||||
"countIfUndefined": "true",
|
||||
"params": {
|
||||
"windowSize": "4"
|
||||
}
|
||||
}
|
||||
],
|
||||
"threshold": 0.7,
|
||||
"aggregation": "AVG",
|
||||
"positive": "MATCH",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "MATCH",
|
||||
"ignoreUndefined": "true"
|
||||
}
|
||||
},
|
||||
"model": [
|
||||
{
|
||||
"name": "name",
|
||||
"type": "String",
|
||||
"path": "$.name"
|
||||
},
|
||||
{
|
||||
"name": "coauthors",
|
||||
"type": "List",
|
||||
"path": "$.coauthors[*].name",
|
||||
"size": 200
|
||||
},
|
||||
{
|
||||
"name": "year",
|
||||
"type": "String",
|
||||
"path": "$.year"
|
||||
},
|
||||
{
|
||||
"name": "pub_id",
|
||||
"type": "String",
|
||||
"path": "$.pub_id"
|
||||
},
|
||||
{
|
||||
"name": "org",
|
||||
"type": "String",
|
||||
"path": "$.org"
|
||||
}
|
||||
],
|
||||
"blacklists": {},
|
||||
"synonyms": {}
|
||||
}
|
||||
}
|
|
@ -29,26 +29,24 @@
|
|||
},
|
||||
"pace": {
|
||||
"clustering" : [
|
||||
{ "name" : "personClustering", "fields" : [ "fullname" ], "params" : {} },
|
||||
{ "name" : "personHash", "fields" : [ "fullname" ], "params" : {} }
|
||||
{ "name" : "lnfi", "fields" : [ "name" ], "params" : {} }
|
||||
],
|
||||
"decisionTree": {
|
||||
"start": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "year",
|
||||
"comparator": "numbersComparator",
|
||||
"field": "pub_id",
|
||||
"comparator": "exactMatch",
|
||||
"weight": 1,
|
||||
"countIfUndefined": "false",
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"threshold": 50,
|
||||
"aggregation": "MAX",
|
||||
"threshold":1,
|
||||
"aggregation": "AVG",
|
||||
"positive": "NO_MATCH",
|
||||
"negative": "surnames",
|
||||
"undefined": "surnames",
|
||||
"ignoreUndefined": "true"
|
||||
"negative": "yearCheck",
|
||||
"undefined": "yearCheck"
|
||||
},
|
||||
"surnames": {
|
||||
"fields": [
|
||||
|
@ -65,7 +63,7 @@
|
|||
}
|
||||
}
|
||||
],
|
||||
"threshold": 0.6,
|
||||
"threshold": 0.5,
|
||||
"aggregation": "MAX",
|
||||
"positive": "MATCH",
|
||||
"negative": "NO_MATCH",
|
||||
|
@ -75,7 +73,7 @@
|
|||
},
|
||||
"model": [
|
||||
{
|
||||
"name": "fullname",
|
||||
"name": "name",
|
||||
"type": "String",
|
||||
"path": "$.name"
|
||||
},
|
||||
|
@ -88,12 +86,17 @@
|
|||
{
|
||||
"name": "year",
|
||||
"type": "String",
|
||||
"path": "$.publication.year"
|
||||
"path": "$.year"
|
||||
},
|
||||
{
|
||||
"name": "title",
|
||||
"name": "pub_id",
|
||||
"type": "String",
|
||||
"path": "$.publication.title"
|
||||
"path": "$.pub_id"
|
||||
},
|
||||
{
|
||||
"name": "org",
|
||||
"type": "String",
|
||||
"path": "$.org"
|
||||
}
|
||||
],
|
||||
"blacklists": {},
|
|
@ -0,0 +1,77 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.Person;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@ClusteringClass("lnfi")
|
||||
public class LastNameFirstInitial extends AbstractClusteringFunction{
|
||||
|
||||
private boolean DEFAULT_AGGRESSIVE = true;
|
||||
|
||||
public LastNameFirstInitial(final Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<String> apply(Config conf, List<Field> fields) {
|
||||
return fields.stream().filter(f -> !f.isEmpty())
|
||||
.map(Field::stringValue)
|
||||
.map(this::normalize)
|
||||
.map(s -> doApply(conf, s))
|
||||
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
||||
.flatMap(c -> c.stream())
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String normalize(final String s) {
|
||||
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
|
||||
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
|
||||
.replaceAll("[^ \\w]+", "")
|
||||
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
|
||||
.replaceAll("(\\p{Punct})+", " ")
|
||||
.replaceAll("(\\d)+", " ")
|
||||
.replaceAll("(\\n)+", " ")
|
||||
.trim();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(final Config conf, final String s) {
|
||||
|
||||
final List<String> res = Lists.newArrayList();
|
||||
|
||||
final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE);
|
||||
|
||||
Person p = new Person(s, aggressive);
|
||||
|
||||
if (p.isAccurate()) {
|
||||
String lastName = p.getNormalisedSurname().toLowerCase();
|
||||
String firstInitial = p.getNormalisedFirstName().toLowerCase().substring(0,1);
|
||||
|
||||
res.add(firstInitial.concat(lastName));
|
||||
}
|
||||
else { // is not accurate, meaning it has no defined name and surname
|
||||
List<String> fullname = Arrays.asList(p.getNormalisedFullname().split(" "));
|
||||
if (fullname.size() == 1) {
|
||||
res.add(p.getNormalisedFullname().toLowerCase());
|
||||
}
|
||||
else if (fullname.size() == 2) {
|
||||
res.add(fullname.get(0).substring(0,1).concat(fullname.get(1)).toLowerCase());
|
||||
res.add(fullname.get(1).substring(0,1).concat(fullname.get(0)).toLowerCase());
|
||||
}
|
||||
else {
|
||||
res.add(fullname.get(0).substring(0,1).concat(fullname.get(fullname.size()-1)).toLowerCase());
|
||||
res.add(fullname.get(fullname.size()-1).substring(0,1).concat(fullname.get(0)).toLowerCase());
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
}
|
|
@ -43,7 +43,7 @@ public class Person {
|
|||
// s = s.replaceAll("[\\W&&[^,-]]", "");
|
||||
}
|
||||
|
||||
if (s.contains(",")) {
|
||||
if (s.contains(",")) { //if the name contains a comma it is easy derivable the name and the surname
|
||||
final String[] arr = s.split(",");
|
||||
if (arr.length == 1) {
|
||||
fullname = splitTerms(arr[0]);
|
||||
|
|
|
@ -237,4 +237,13 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLastNameFirstInitial(){
|
||||
|
||||
final ClusteringFunction cf = new LastNameFirstInitial(params);
|
||||
final String s = "LI Yonghong";
|
||||
System.out.println("s = " + s);
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
|
||||
}
|
||||
|
||||
}
|
|
@ -30,6 +30,11 @@ public class UtilTest {
|
|||
|
||||
assertEquals("kennedy", p.getSurnameString());
|
||||
assertEquals("j f", p.getNameString());
|
||||
|
||||
p = new Person("Guan-Hua Du", false);
|
||||
|
||||
System.out.println("surname = " + p.getSurnameString());
|
||||
System.out.println("name = " + p.getNameString());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue