From 66472ce408eb99bd2c66f79d38667586db1f329f Mon Sep 17 00:00:00 2001 From: miconis Date: Tue, 31 Jan 2023 11:53:10 +0100 Subject: [PATCH] implementation of author dedup configuration and lnfi clustering function --- .../java/eu/dnetlib/pace/DedupLocalTest.java | 6 +- .../pace/config/authors.fdup.conf.json | 179 ++++++++++++++++++ ....conf.json => authors.fdup.soft.conf.json} | 31 +-- .../pace/clustering/LastNameFirstInitial.java | 77 ++++++++ .../java/eu/dnetlib/pace/model/Person.java | 2 +- .../clustering/ClusteringFunctionTest.java | 9 + .../java/eu/dnetlib/pace/util/UtilTest.java | 5 + 7 files changed, 291 insertions(+), 18 deletions(-) create mode 100644 dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/authors.fdup.conf.json rename dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/{auth.tree.conf.json => authors.fdup.soft.conf.json} (79%) create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java index 92c1070..b401516 100644 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java +++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java @@ -238,11 +238,11 @@ public class DedupLocalTest extends DedupTestUtils { @Test //test the match between two JSON @Disabled public void matchTest() throws Exception { - String json1 = "{\"author\":[{\"affiliation\":[],\"fullname\":\"Hanayik, Taylor\",\"name\":\"Taylor\",\"pid\":[],\"rank\":1,\"surname\":\"Hanayik\"},{\"affiliation\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"University of South Carolina\"}],\"fullname\":\"Drake, 
Christopher\",\"name\":\"Christopher\",\"pid\":[],\"rank\":2,\"surname\":\"Drake\"},{\"affiliation\":[],\"fullname\":\"Rorden, Chris\",\"name\":\"Chris\",\"pid\":[],\"rank\":3,\"surname\":\"Rorden\"},{\"affiliation\":[],\"fullname\":\"Hardcastle, Nell\",\"name\":\"Nell\",\"pid\":[],\"rank\":4,\"surname\":\"Hardcastle\"},{\"affiliation\":[],\"fullname\":\"Androulakis, Anthony\",\"name\":\"Anthony\",\"pid\":[],\"rank\":5,\"surname\":\"Androulakis\"}],\"bestaccessright\":{\"classid\":\"OPEN\",\"classname\":\"Open Access\",\"schemeid\":\"dnet:access_modes\",\"schemename\":\"dnet:access_modes\"},\"collectedfrom\":[{\"key\":\"10|openaire____::9e3be59865b2c1c335d32dae2fe7b254\",\"value\":\"Datacite\"}],\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"dateofacceptance\":{\"value\":\"2022-03-02\"},\"dateofcollection\":\"2022-03-02T11:25:20+0000\",\"dateoftransformation\":\"2022-03-02T11:25:20+0000\",\"description\":[{\"value\":\"a WebGL2 based NIFTI volume viewer\"}],\"id\":\"50|doi_________::b596bfb411bbc62b902aedb11d0088d8\",\"instance\":[{\"accessright\":{\"classid\":\"OPEN\",\"classname\":\"Open 
Access\",\"schemeid\":\"dnet:access_modes\",\"schemename\":\"dnet:access_modes\"},\"collectedfrom\":{\"key\":\"10|openaire____::9e3be59865b2c1c335d32dae2fe7b254\",\"value\":\"Datacite\"},\"dateofacceptance\":{\"value\":\"2022-03-02\"},\"hostedby\":{\"key\":\"10|opendoar____::358aee4cc897452c00244351e4d91f69\",\"value\":\"ZENODO\"},\"instancetype\":{\"classid\":\"0029\",\"classname\":\"Software\",\"schemeid\":\"dnet:publication_resource\",\"schemename\":\"dnet:publication_resource\"},\"pid\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.5281/zenodo.5786269\"}],\"refereed\":{\"classid\":\"0000\",\"classname\":\"Unknown\",\"schemeid\":\"dnet:review_levels\",\"schemename\":\"dnet:review_levels\"},\"url\":[\"https://dx.doi.org/10.5281/zenodo.5786269\"]}],\"language\":{\"classid\":\"und\",\"classname\":\"Undetermined\",\"schemeid\":\"dnet:languages\",\"schemename\":\"dnet:languages\"},\"originalId\":[\"10.5281/zenodo.5786269\"],\"pid\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object 
Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.5281/zenodo.5786269\"}],\"publisher\":{\"value\":\"Zenodo\"},\"relevantdate\":[{\"qualifier\":{\"classid\":\"issued\",\"classname\":\"issued\",\"schemeid\":\"dnet:dataCite_date\",\"schemename\":\"dnet:dataCite_date\"},\"value\":\"2022-03-02\"}],\"resourcetype\":{\"classid\":\"UNKNOWN\",\"classname\":\"Unknown\",\"schemeid\":\"dnet:dataCite_resource\",\"schemename\":\"dnet:dataCite_resource\"},\"resulttype\":{\"classid\":\"software\",\"classname\":\"software\",\"schemeid\":\"dnet:result_typologies\",\"schemename\":\"dnet:result_typologies\"},\"subject\":[],\"title\":[{\"qualifier\":{\"classid\":\"main title\",\"classname\":\"main title\",\"schemeid\":\"dnet:dataCite_title\",\"schemename\":\"dnet:dataCite_title\"},\"value\":\"niivue/niivue: 0.21.1\"}]}"; - String json2 = "{\"author\":[{\"affiliation\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"University of South Carolina\"}],\"fullname\":\"Chris Rorden\",\"name\":\"\",\"pid\":[],\"rank\":1,\"surname\":\"\"},{\"affiliation\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"University of Oxford\"}],\"fullname\":\"Taylor 
Hanayik\",\"name\":\"\",\"pid\":[],\"rank\":2,\"surname\":\"\"},{\"affiliation\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"University of South Carolina\"}],\"fullname\":\"Christopher Drake\",\"name\":\"\",\"pid\":[],\"rank\":3,\"surname\":\"\"},{\"affiliation\":[],\"fullname\":\"Nell Hardcastle\",\"name\":\"\",\"pid\":[],\"rank\":4,\"surname\":\"\"},{\"affiliation\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"University of South Carolina\"}],\"fullname\":\"Roger Newman-Norlund\",\"name\":\"\",\"pid\":[],\"rank\":5,\"surname\":\"\"}],\"bestaccessright\":{\"classid\":\"OPEN\",\"classname\":\"Open 
Access\",\"schemeid\":\"dnet:access_modes\",\"schemename\":\"dnet:access_modes\"},\"collectedfrom\":[{\"key\":\"10|opendoar____::358aee4cc897452c00244351e4d91f69\",\"value\":\"ZENODO\"}],\"context\":[],\"contributor\":[],\"country\":[],\"coverage\":[],\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"dateofacceptance\":{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"2021-12-16\"},\"dateofcollection\":\"2022-11-11T00:19:05+0000\",\"dateoftransformation\":\"2022-11-11T07:32:42.689Z\",\"description\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"

a WebGL2 based NIFTI volume viewer

\"}],\"documentationUrl\":[],\"eoscifguidelines\":[],\"externalReference\":[],\"extraInfo\":[],\"format\":[],\"fulltext\":[],\"id\":\"50|doi_________::59a7f397515febc2c7017fd2f866a777\",\"instance\":[{\"accessright\":{\"classid\":\"OPEN\",\"classname\":\"Open Access\",\"schemeid\":\"dnet:access_modes\",\"schemename\":\"dnet:access_modes\"},\"alternateIdentifier\":[],\"collectedfrom\":{\"key\":\"10|opendoar____::358aee4cc897452c00244351e4d91f69\",\"value\":\"ZENODO\"},\"dateofacceptance\":{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"2021-12-16\"},\"hostedby\":{\"key\":\"10|opendoar____::358aee4cc897452c00244351e4d91f69\",\"value\":\"ZENODO\"},\"instancetype\":{\"classid\":\"0029\",\"classname\":\"Software\",\"schemeid\":\"dnet:publication_resource\",\"schemename\":\"dnet:publication_resource\"},\"pid\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object 
Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.5281/zenodo.5786270\"}],\"refereed\":{\"classid\":\"0000\",\"classname\":\"UNKNOWN\",\"schemeid\":\"dnet:review_levels\",\"schemename\":\"dnet:review_levels\"},\"url\":[\"http://dx.doi.org/10.5281/zenodo.5786270\",\"https://doi.org/10.5281/zenodo.5786270\"]}],\"language\":{\"classid\":\"UNKNOWN\",\"classname\":\"UNKNOWN\",\"schemeid\":\"dnet:languages\",\"schemename\":\"dnet:languages\"},\"lastupdatetimestamp\":1668556279928,\"license\":[],\"originalId\":[\"oai:zenodo.org:5786270\",\"50|od______2659::c8a3ff31e2ff537ebac22cdc4d7c64b0\"],\"pid\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.5281/zenodo.5786270\"}],\"programmingLanguage\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"},\"publisher\":{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"Zenodo\"},\"relevantdate\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"issued\",\"classname\":\"issued\",\"schemeid\":\"dnet:dataCite_date\",\"schemename\":\"dnet:dataCite_date\"},\"value\":\"2021-12-16\"}],\"re
sourcetype\":{\"classid\":\"UNKNOWN\",\"classname\":\"UNKNOWN\",\"schemeid\":\"dnet:dataCite_resource\",\"schemename\":\"dnet:dataCite_resource\"},\"resulttype\":{\"classid\":\"software\",\"classname\":\"software\",\"schemeid\":\"dnet:result_typologies\",\"schemename\":\"dnet:result_typologies\"},\"source\":[],\"subject\":[],\"title\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"main title\",\"classname\":\"main title\",\"schemeid\":\"dnet:dataCite_title\",\"schemename\":\"dnet:dataCite_title\"},\"value\":\"niivue/niivue: 0.13.0\"}]}"; + String json1 = "{\"id\": \"c5a91d78623c9fb6014cb1d3b941cdba\", \"name\": \"Yongkun Li\", \"org\": \"Yunnan Univ, Dept Math, Kunming 650091, Yunnan, Peoples R China\", \"pub_id\": \"OYE6nnOK\", \"gt_id\": \"uRrcChJK\", \"keywords\": [\"almost periodic solution\", \"global exponential stability\", \"neural networks\", \"time scales\"], \"venue\": \"MATHEMATICAL METHODS IN THE APPLIED SCIENCES\", \"year\": 2016, \"topics\": [0.00814664177596569, 0.035848259925842285, 0.009581967256963253, 0.0321519710123539, 0.3717975616455078, 0.014515174552798271, 0.28195104002952576, 0.01696726493537426, 0.011196448467671871, 0.21784362196922302], \"coauthors\": [{\"name\": \"Pan Wang\", \"org\": \"Yunnan Univ, Dept Math, Kunming 650091, Yunnan, Peoples R China\"}, {\"name\": \"Yuan Ye\", \"org\": \"Yunnan Univ, Grad Sch, Kunming 650091, Yunnan, Peoples R China\"}]}"; + String json2 = "{\"id\": \"a080d0025d6af103d070c5e2a597ce80\", \"name\": \"Yongkun Li\", \"org\": \"Department of Mathematics, Yunnan University, Kunming, Yunnan 650091, People’s Republic of China\", \"pub_id\": \"626RR8r5\", \"gt_id\": \"uRrcChJK\", \"keywords\": [\"Positive periodic solutions\", \"Delay competition 
system\", \"Coincidence degree\", \"Harvesting term\"], \"venue\": \"Nonlinear Analysis: Real World Applications\", \"year\": 2011, \"topics\": [0.008208894170820713, 0.034121282398700714, 0.009655232541263103, 0.02830354869365692, 0.37834030389785767, 0.014626123011112213, 0.284756064414978, 0.017096837982535362, 0.011282012797892094, 0.2136097401380539], \"coauthors\": [{\"name\": \"Kaihong Zhao\", \"org\": \"Department of Mathematics, Yunnan University, Kunming, Yunnan 650091, People’s Republic of China\"}, {\"name\": \"Yuan Ye\", \"org\": \"Graduate School of Yunnan University, Yunnan University, Kunming, Yunnan 650091, People’s Republic of China\"}]}"; DedupConfig config = DedupConfig.load(readFileFromHDFS(Paths - .get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/sw.tree.conf.json").toURI()) + .get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/authors.fdup.conf.json").toURI()) .toFile() .getAbsolutePath())); diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/authors.fdup.conf.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/authors.fdup.conf.json new file mode 100644 index 0000000..be611a8 --- /dev/null +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/authors.fdup.conf.json @@ -0,0 +1,179 @@ +{ + "wf": { + "threshold": "0.99", + "dedupRun": "001", + "entityType": "author", + "subEntityType": "author", + "subEntityValue": "author", + "orderField": "fullname", + "queueMaxSize": "200", + "groupMaxSize": "100", + "maxChildren": "100", + "slidingWindowSize": "50", + "rootBuilder": [ + "result", + "resultProject_outcome_isProducedBy", + "resultResult_publicationDataset_isRelatedTo", + "resultResult_similarity_isAmongTopNSimilarDocuments", + "resultResult_similarity_hasAmongTopNSimilarDocuments", + "resultOrganization_affiliation_isAffiliatedWith", + "resultResult_part_hasPart", + "resultResult_part_isPartOf", + "resultResult_supplement_isSupplementTo", + 
"resultResult_supplement_isSupplementedBy", + "resultResult_version_isVersionOf" + ], + "includeChildren": "true", + "maxIterations": 20, + "idPath": "$.id" + }, + "pace": { + "clustering" : [ + { "name" : "lnfi", "fields" : [ "name" ], "params" : {} } + ], + "decisionTree": { + "start": { + "fields": [ + { + "field": "pub_id", + "comparator": "exactMatch", + "weight": 1, + "countIfUndefined": "false", + "params": {} + } + ], + "threshold":1, + "aggregation": "AVG", + "positive": "NO_MATCH", + "negative": "yearCheck", + "undefined": "yearCheck" + }, + "yearCheck": { + "fields": [ + { + "field": "year", + "comparator": "numbersComparator", + "weight": 1, + "countIfUndefined": "false", + "params": {} + } + ], + "threshold": 50, + "aggregation": "MAX", + "positive": "NO_MATCH", + "negative": "surnames", + "undefined": "surnames", + "ignoreUndefined": "true" + }, + "surnames": { + "fields": [ + { + "field": "coauthors", + "comparator": "authorsMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": { + "surname_th": 0.75, + "fullname_th": 0.75, + "size_th": 20, + "mode": "surname" + } + } + ], + "threshold": 0.5, + "aggregation": "MAX", + "positive": "MATCH", + "negative": "cityCheck", + "undefined": "cityCheck", + "ignoreUndefined": "true" + }, + "cityCheck": { + "fields": [ + { + "field": "org", + "comparator": "cityMatch", + "weight": 1.0, + "countIfUndefined": "true", + "params": { + "windowSize": "4" + } + } + ], + "threshold": 0.1, + "aggregation": "AVG", + "positive": "keywordCheck", + "negative": "NO_MATCH", + "undefined": "keywordCheck", + "ignoreUndefined": "true" + }, + "keywordCheck": { + "fields": [ + { + "field": "org", + "comparator": "keywordMatch", + "weight": 1.0, + "countIfUndefined": "true", + "params": { + "windowSize": "4" + } + } + ], + "threshold": 0.5, + "aggregation": "AVG", + "positive": "orgCheck", + "negative": "NO_MATCH", + "undefined": "orgCheck", + "ignoreUndefined": "true" + }, + "orgCheck": { + "fields": [ + { + "field": 
"org", + "comparator": "jaroWinklerNormalizedName", + "weight": 1, + "countIfUndefined": "true", + "params": { + "windowSize": "4" + } + } + ], + "threshold": 0.7, + "aggregation": "AVG", + "positive": "MATCH", + "negative": "NO_MATCH", + "undefined": "MATCH", + "ignoreUndefined": "true" + } + }, + "model": [ + { + "name": "name", + "type": "String", + "path": "$.name" + }, + { + "name": "coauthors", + "type": "List", + "path": "$.coauthors[*].name", + "size": 200 + }, + { + "name": "year", + "type": "String", + "path": "$.year" + }, + { + "name": "pub_id", + "type": "String", + "path": "$.pub_id" + }, + { + "name": "org", + "type": "String", + "path": "$.org" + } + ], + "blacklists": {}, + "synonyms": {} + } +} \ No newline at end of file diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/auth.tree.conf.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/authors.fdup.soft.conf.json similarity index 79% rename from dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/auth.tree.conf.json rename to dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/authors.fdup.soft.conf.json index 0ac29f8..bfe13aa 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/auth.tree.conf.json +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/authors.fdup.soft.conf.json @@ -29,26 +29,24 @@ }, "pace": { "clustering" : [ - { "name" : "personClustering", "fields" : [ "fullname" ], "params" : {} }, - { "name" : "personHash", "fields" : [ "fullname" ], "params" : {} } + { "name" : "lnfi", "fields" : [ "name" ], "params" : {} } ], "decisionTree": { "start": { "fields": [ { - "field": "year", - "comparator": "numbersComparator", + "field": "pub_id", + "comparator": "exactMatch", "weight": 1, "countIfUndefined": "false", "params": {} } ], - "threshold": 50, - "aggregation": "MAX", + "threshold":1, + "aggregation": "AVG", "positive": "NO_MATCH", - "negative": "surnames", - "undefined": "surnames", - "ignoreUndefined": 
"true" + "negative": "yearCheck", + "undefined": "yearCheck" }, "surnames": { "fields": [ @@ -65,7 +63,7 @@ } } ], - "threshold": 0.6, + "threshold": 0.5, "aggregation": "MAX", "positive": "MATCH", "negative": "NO_MATCH", @@ -75,7 +73,7 @@ }, "model": [ { - "name": "fullname", + "name": "name", "type": "String", "path": "$.name" }, @@ -88,12 +86,17 @@ { "name": "year", "type": "String", - "path": "$.publication.year" + "path": "$.year" }, { - "name": "title", + "name": "pub_id", "type": "String", - "path": "$.publication.title" + "path": "$.pub_id" + }, + { + "name": "org", + "type": "String", + "path": "$.org" } ], "blacklists": {}, diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java new file mode 100644 index 0000000..7f86854 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java @@ -0,0 +1,77 @@ +package eu.dnetlib.pace.clustering; + +import com.google.common.collect.Lists; +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.Person; +import org.apache.commons.lang3.StringUtils; + +import java.util.*; +import java.util.stream.Collectors; + +@ClusteringClass("lnfi") +public class LastNameFirstInitial extends AbstractClusteringFunction{ + + private boolean DEFAULT_AGGRESSIVE = true; + + public LastNameFirstInitial(final Map params) { + super(params); + } + + @Override + public Collection apply(Config conf, List fields) { + return fields.stream().filter(f -> !f.isEmpty()) + .map(Field::stringValue) + .map(this::normalize) + .map(s -> doApply(conf, s)) + .map(c -> filterBlacklisted(c, ngramBlacklist)) + .flatMap(c -> c.stream()) + .filter(StringUtils::isNotBlank) + .collect(Collectors.toCollection(HashSet::new)); + } + + @Override + protected String normalize(final String s) { + return fixAliases(transliterate(nfd(unicodeNormalization(s)))) + 
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings + .replaceAll("[^ \\w]+", "") + .replaceAll("(\\p{InCombiningDiacriticalMarks})+", "") + .replaceAll("(\\p{Punct})+", " ") + .replaceAll("(\\d)+", " ") + .replaceAll("(\\n)+", " ") + .trim(); + } + + @Override + protected Collection doApply(final Config conf, final String s) { + + final List res = Lists.newArrayList(); + + final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE); + + Person p = new Person(s, aggressive); + + if (p.isAccurate()) { + String lastName = p.getNormalisedSurname().toLowerCase(); + String firstInitial = p.getNormalisedFirstName().toLowerCase().substring(0,1); + + res.add(firstInitial.concat(lastName)); + } + else { // is not accurate, meaning it has no defined name and surname + List fullname = Arrays.asList(p.getNormalisedFullname().split(" ")); + if (fullname.size() == 1) { + res.add(p.getNormalisedFullname().toLowerCase()); + } + else if (fullname.size() == 2) { + res.add(fullname.get(0).substring(0,1).concat(fullname.get(1)).toLowerCase()); + res.add(fullname.get(1).substring(0,1).concat(fullname.get(0)).toLowerCase()); + } + else { + res.add(fullname.get(0).substring(0,1).concat(fullname.get(fullname.size()-1)).toLowerCase()); + res.add(fullname.get(fullname.size()-1).substring(0,1).concat(fullname.get(0)).toLowerCase()); + } + } + + return res; + } +} \ No newline at end of file diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java index ec33406..543b1bd 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java @@ -43,7 +43,7 @@ public class Person { // s = s.replaceAll("[\\W&&[^,-]]", ""); } - if (s.contains(",")) { + if (s.contains(",")) { //if the name contains a comma it is 
easy to derive the name and the surname final String[] arr = s.split(","); if (arr.length == 1) { fullname = splitTerms(arr[0]); diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java index 7a1d389..f57daaa 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java @@ -237,4 +237,13 @@ public class ClusteringFunctionTest extends AbstractPaceTest { } + @Test + public void testLastNameFirstInitial(){ + + final ClusteringFunction cf = new LastNameFirstInitial(params); + final String s = "LI Yonghong"; + System.out.println("s = " + s); + System.out.println(cf.apply(conf, Lists.newArrayList(title(s)))); + } + } \ No newline at end of file diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java index 601831e..1e60532 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java @@ -30,6 +30,11 @@ public class UtilTest { assertEquals("kennedy", p.getSurnameString()); assertEquals("j f", p.getNameString()); + + p = new Person("Guan-Hua Du", false); + + System.out.println("surname = " + p.getSurnameString()); + System.out.println("name = " + p.getNameString()); } }