implementation of new json comparator and update of the publication configuration

This commit is contained in:
miconis 2019-12-17 09:16:26 +01:00
parent d09193a094
commit 159cb2a493
22 changed files with 2001 additions and 651 deletions

View File

@ -13,6 +13,7 @@ import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
import scala.Tuple2;
@ -30,7 +31,7 @@ public class DedupLocalTest extends DedupTestUtils {
@Before
public void setup() {
config = DedupConfig.load(Utility.readFromClasspath("/eu/dnetlib/pace/config/organization.strict.conf.json", DedupLocalTest.class));
config = DedupConfig.load(Utility.readFromClasspath("/eu/dnetlib/pace/config/publication.current.conf.json", DedupLocalTest.class));
treeProcessor = new TreeProcessor(config);
final SparkSession spark = SparkSession
@ -39,11 +40,13 @@ public class DedupLocalTest extends DedupTestUtils {
.master("local[*]")
.getOrCreate();
context = new JavaSparkContext(spark.sparkContext());
final URL dataset = getClass().getResource("/eu/dnetlib/pace/examples/organization.to.fix.json");
final URL dataset = getClass().getResource("/eu/dnetlib/pace/examples/publications.dump.1000.json");
entities = context.textFile(dataset.getPath());
}
@Ignore
@Test
public void dedupTest(){
@ -57,6 +60,7 @@ public class DedupLocalTest extends DedupTestUtils {
}
@Ignore
@Test
public void relationsTest() {
@ -112,7 +116,7 @@ public class DedupLocalTest extends DedupTestUtils {
}
@Ignore
@Test
public void matchTest(){
@ -128,7 +132,7 @@ public class DedupLocalTest extends DedupTestUtils {
}
@Ignore
@Test
public void parseJSONEntityTest(){
String jsonEntity = "{\"dateoftransformation\":\"2018-09-19\",\"originalId\":[\"doajarticles::Sociedade_Brasileira_de_Reumatologia\"],\"collectedfrom\":[{\"value\":\"DOAJ-Articles\",\"key\":\"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824\"}],\"organization\":{\"metadata\":{\"eclegalbody\":{\"value\":\"false\"},\"eclegalperson\":{\"value\":\"false\"},\"ecinternationalorganization\":{\"value\":\"false\"},\"legalshortname\":{\"value\":\"Sociedade Brasileira de Reumatologia\"},\"ecresearchorganization\":{\"value\":\"false\"},\"ecnonprofit\":{\"value\":\"false\"},\"ecenterprise\":{\"value\":\"false\"},\"ecnutscode\":{\"value\":\"false\"},\"ecinternationalorganizationeurinterests\":{\"value\":\"false\"},\"legalname\":{\"value\":\"Sociedade Brasileira de Reumatologia\"},\"country\":{\"classid\":\"BR\",\"classname\":\"Brazil\",\"schemename\":\"dnet:countries\",\"schemeid\":\"dnet:countries\"},\"echighereducation\":{\"value\":\"false\"},\"ecsmevalidated\":{\"value\":\"false\"}}},\"dateofcollection\":\"2018-09-19\",\"type\":20,\"id\":\"20|doajarticles::0019ba7a22c5bc733c3206bde28ff568\"}";

View File

@ -0,0 +1,387 @@
{
"wf": {
"threshold": "0.99",
"dedupRun": "001",
"entityType": "result",
"subEntityType": "resulttype",
"subEntityValue": "publication",
"orderField": "title",
"queueMaxSize": "2000",
"groupMaxSize": "100",
"maxChildren": "100",
"slidingWindowSize": "200",
"rootBuilder": [
"result",
"resultProject_outcome_isProducedBy",
"resultResult_publicationDataset_isRelatedTo",
"resultResult_similarity_isAmongTopNSimilarDocuments",
"resultResult_similarity_hasAmongTopNSimilarDocuments",
"resultOrganization_affiliation_isAffiliatedWith",
"resultResult_part_hasPart",
"resultResult_part_isPartOf",
"resultResult_supplement_isSupplementTo",
"resultResult_supplement_isSupplementedBy",
"resultResult_version_isVersionOf"
],
"includeChildren": "true",
"maxIterations": 20,
"idPath": "$.id"
},
"pace": {
"clustering": [
{
"name": "ngrampairs",
"fields": [
"title"
],
"params": {
"max": "1",
"ngramLen": "3"
}
},
{
"name": "suffixprefix",
"fields": [
"title"
],
"params": {
"max": "1",
"len": "3"
}
},
{
"name": "lowercase",
"fields": [
"doi"
],
"params": {}
}
],
"decisionTree": {
"start": {
"fields": [
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"threshold": "0.5",
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid"
}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "layer2",
"undefined": "layer2",
"ignoreUndefined": "true"
},
"layer2": {
"fields": [
{
"field": "title",
"comparator": "titleVersionMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
},
{
"field": "authors",
"comparator": "sizeMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "NC",
"positive": "layer3",
"negative": "NO_MATCH",
"undefined": "layer3",
"ignoreUndefined": "false"
},
"layer3": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.99,
"aggregation": "SUM",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
}
},
"model": [
{
"name": "doi",
"type": "String",
"path": "$.pid[?(@.qualifier.classid == 'doi')].value"
},
{
"name": "pid",
"type": "JSON",
"path": "$.pid[*]",
"overrideMatch": "true"
},
{
"name": "title",
"type": "String",
"path": "$.title[?(@.qualifier.classid == 'main title')].value",
"length": 250,
"size": 5
},
{
"name": "authors",
"type": "List",
"path": "$.author[*].fullname[*]",
"size": 200
},
{
"name": "resulttype",
"type": "String",
"path": "$.resulttype.classid"
}
],
"blacklists": {
"title": [
"^Inside Front Cover$",
"(?i)^Poster presentations$",
"^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
"^Problems with perinatal pathology\\.?$",
"(?i)^Cases? of Puerperal Convulsions$",
"(?i)^Operative Gyna?ecology$",
"(?i)^Mind the gap\\!?\\:?$",
"^Chronic fatigue syndrome\\.?$",
"^Cartas? ao editor Letters? to the Editor$",
"^Note from the Editor$",
"^Anesthesia Abstract$",
"^Annual report$",
"(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
"(?i)^Graph and Table of Infectious Diseases?$",
"^Presentation$",
"(?i)^Reviews and Information on Publications$",
"(?i)^PUBLIC HEALTH SERVICES?$",
"(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
"(?i)^Adrese autora$",
"(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
"(?i)^Acknowledgement to Referees$",
"(?i)^Behçet's disease\\.?$",
"(?i)^Isolation and identification of restriction endonuclease.*$",
"(?i)^CEREBROVASCULAR DISEASES?.?$",
"(?i)^Screening for abdominal aortic aneurysms?\\.?$",
"^Event management$",
"(?i)^Breakfast and Crohn's disease.*\\.?$",
"^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
"(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
"^Gushi hakubutsugaku$",
"^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
"^Intestinal spirocha?etosis$",
"^Treatment of Rodent Ulcer$",
"(?i)^\\W*Cloud Computing\\W*$",
"^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
"^Free Communications, Poster Presentations: Session [A-F]$",
"^“The Historical Aspects? of Quackery\\.?”$",
"^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
"^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
"(?i)^Case Report$",
"^Boletín Informativo$",
"(?i)^Glioblastoma Multiforme$",
"(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
"^Zaměstnanecké výhody$",
"(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
"(?i)^Carotid body tumours?\\.?$",
"(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
"^Avant-propos$",
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
"(?i)^PUBLIC HEALTH VERSUS THE STATE$",
"^Viñetas de Cortázar$",
"(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
"(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
"(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
"(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
"(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
"^Aus der AGMB$",
"^Znanstveno-stručni prilozi$",
"(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
"(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
"(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
"^Finanční analýza podniku$",
"^Financial analysis( of business)?$",
"(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
"^Jikken nihon shūshinsho$",
"(?i)^CORONER('|s)(s|') INQUESTS$",
"(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
"(?i)^Consultants' contract(s)?$",
"(?i)^Upute autorima$",
"(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
"^Joshi shin kokubun$",
"^Kōtō shōgaku dokuhon nōson'yō$",
"^Jinjō shōgaku shōka$",
"^Shōgaku shūjichō$",
"^Nihon joshi dokuhon$",
"^Joshi shin dokuhon$",
"^Chūtō kanbun dokuhon$",
"^Wabun dokuhon$",
"(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
"(?i)^cardiac rehabilitation$",
"(?i)^Analytical summary$",
"^Thesaurus resolutionum Sacrae Congregationis Concilii$",
"(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
"^Prikazi i osvrti$",
"^Rodinný dům s provozovnou$",
"^Family house with an establishment$",
"^Shinsei chūtō shin kokugun$",
"^Pulmonary alveolar proteinosis(\\.?)$",
"^Shinshū kanbun$",
"^Viñeta(s?) de Rodríguez$",
"(?i)^RUBRIKA UREDNIKA$",
"^A Matching Model of the Academic Publication Market$",
"^Yōgaku kōyō$",
"^Internetový marketing$",
"^Internet marketing$",
"^Chūtō kokugo dokuhon$",
"^Kokugo dokuhon$",
"^Antibiotic Cover for Dental Extraction(s?)$",
"^Strategie podniku$",
"^Strategy of an Enterprise$",
"(?i)^respiratory disease(s?)(\\.?)$",
"^Award(s?) for Gallantry in Civil Defence$",
"^Podniková kultura$",
"^Corporate Culture$",
"^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
"^Pracovní motivace$",
"^Work Motivation$",
"^Kaitei kōtō jogaku dokuhon$",
"^Konsolidovaná účetní závěrka$",
"^Consolidated Financial Statements$",
"(?i)^intracranial tumour(s?)$",
"^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
"^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
"^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
"^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
"^Úroveň motivačního procesu jako způsobu vedení lidí$",
"^The level of motivation process as a leadership$",
"^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
"(?i)^news and events$",
"(?i)^NOVOSTI I DOGAĐAJI$",
"^Sansū no gakushū$",
"^Posouzení informačního systému firmy a návrh změn$",
"^Information System Assessment and Proposal for ICT Modification$",
"^Stresové zatížení pracovníků ve vybrané profesi$",
"^Stress load in a specific job$",
"^Sunday: Poster Sessions, Pt.*$",
"^Monday: Poster Sessions, Pt.*$",
"^Wednesday: Poster Sessions, Pt.*",
"^Tuesday: Poster Sessions, Pt.*$",
"^Analýza reklamy$",
"^Analysis of advertising$",
"^Shōgaku shūshinsho$",
"^Shōgaku sansū$",
"^Shintei joshi kokubun$",
"^Taishō joshi kokubun dokuhon$",
"^Joshi kokubun$",
"^Účetní uzávěrka a účetní závěrka v ČR$",
"(?i)^The \"?Causes\"? of Cancer$",
"^Normas para la publicación de artículos$",
"^Editor('|s)(s|') [Rr]eply$",
"^Editor(|s)(s|) letter$",
"^Redaktoriaus žodis$",
"^DISCUSSION ON THE PRECEDING PAPER$",
"^Kōtō shōgaku shūshinsho jidōyō$",
"^Shōgaku nihon rekishi$",
"^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
"^Préface$",
"^Occupational [Hh]ealth [Ss]ervices.$",
"^In Memoriam Professor Toshiyuki TAKESHIMA$",
"^Účetní závěrka ve vybraném podniku.*$",
"^Financial statements in selected company$",
"^Abdominal [Aa]ortic [Aa]neurysms.*$",
"^Pseudomyxoma peritonei$",
"^Kazalo autora$",
"(?i)^uvodna riječ$",
"^Motivace jako způsob vedení lidí$",
"^Motivation as a leadership$",
"^Polyfunkční dům$",
"^Multi\\-funkcional building$",
"^Podnikatelský plán$",
"(?i)^Podnikatelský záměr$",
"(?i)^Business Plan$",
"^Oceňování nemovitostí$",
"^Marketingová komunikace$",
"^Marketing communication$",
"^Sumario Analítico$",
"^Riječ uredništva$",
"^Savjetovanja i priredbe$",
"^Índice$",
"^(Starobosanski nadpisi).*$",
"^Vzdělávání pracovníků v organizaci$",
"^Staff training in organization$",
"^(Life Histories of North American Geometridae).*$",
"^Strategická analýza podniku$",
"^Strategic Analysis of an Enterprise$",
"^Sadržaj$",
"^Upute suradnicima$",
"^Rodinný dům$",
"(?i)^Fami(l)?ly house$",
"^Upute autorima$",
"^Strategic Analysis$",
"^Finanční analýza vybraného podniku$",
"^Finanční analýza$",
"^Riječ urednika$",
"(?i)^Content(s?)$",
"(?i)^Inhalt$",
"^Jinjō shōgaku shūshinsho jidōyō$",
"(?i)^Index$",
"^Chūgaku kokubun kyōkasho$",
"^Retrato de una mujer$",
"^Retrato de un hombre$",
"^Kōtō shōgaku dokuhon$",
"^Shotōka kokugo$",
"^Shōgaku dokuhon$",
"^Jinjō shōgaku kokugo dokuhon$",
"^Shinsei kokugo dokuhon$",
"^Teikoku dokuhon$",
"^Instructions to Authors$",
"^KİTAP TAHLİLİ$",
"^PRZEGLĄD PIŚMIENNICTWA$",
"(?i)^Presentación$",
"^İçindekiler$",
"(?i)^Tabl?e of contents$",
"^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
"^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
"^Editorial( Board)?$",
"(?i)^Editorial \\(English\\)$",
"^Editörden$",
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
"^(Kiri Karl Morgensternile).*$",
"^(\\[Eksliibris Aleksandr).*\\]$",
"^(\\[Eksliibris Aleksandr).*$",
"^(Eksliibris Aleksandr).*$",
"^(Kiri A\\. de Vignolles).*$",
"^(2 kirja Karl Morgensternile).*$",
"^(Pirita kloostri idaosa arheoloogilised).*$",
"^(Kiri tundmatule).*$",
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
"^(Eksliibris Nikolai Birukovile).*$",
"^(Eksliibris Nikolai Issakovile).*$",
"^(WHP Cruise Summary Information of section).*$",
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
"^(Measurement of the spin\\-dependent structure function).*",
"(?i)^.*authors[']? reply\\.?$",
"(?i)^.*authors[']? response\\.?$"
]
},
"synonyms": {}
}
}

File diff suppressed because one or more lines are too long

View File

@ -1,57 +0,0 @@
package eu.dnetlib.pace.model.adaptor;
import java.util.List;
import com.google.common.base.Function;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.reflect.TypeToken;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * Identifier described by a value and a type, parsed from its OAF JSON form.
 *
 * Created by claudio on 01/03/16.
 */
public class Pid {

    private static final Log log = LogFactory.getLog(Pid.class);

    private String value;

    private String type;

    /**
     * Parses each JSON string of the given list into a {@link Pid}, using a Gson instance on which
     * {@link PidOafSerialiser} is registered as the type adapter for this class.
     *
     * @param json the JSON strings to parse, one identifier per element
     * @return a new list with one parsed identifier per input element, in input order
     */
    public static List<Pid> fromOafJson(final List<String> json) {
        log.debug(String.format("\nPid: %s", json));

        final Gson parser = new GsonBuilder()
                .registerTypeAdapter(Pid.class, new PidOafSerialiser())
                .create();

        final List<Pid> parsed = Lists.newArrayList();
        for (final String element : json) {
            parsed.add(parser.fromJson(element, Pid.class));
        }
        return parsed;
    }

    /** @return the type of this identifier */
    public String getType() {
        return type;
    }

    /** @param type the type of this identifier */
    public void setType(final String type) {
        this.type = type;
    }

    /** @return the value of this identifier */
    public String getValue() {
        return value;
    }

    /** @param value the value of this identifier */
    public void setValue(final String value) {
        this.value = value;
    }
}

View File

@ -1,50 +0,0 @@
package eu.dnetlib.pace.model.adaptor;
import java.lang.reflect.Type;
import java.util.List;
import com.google.common.collect.Lists;
import com.google.gson.*;
import eu.dnetlib.pace.model.gt.GTAuthor;
/**
 * Gson deserialiser that builds a {@link Pid} from a JSON object carrying a "value" member and a
 * "qualifier" object with a "classid" member.
 *
 * Created by claudio on 01/03/16.
 */
public class PidOafSerialiser implements JsonDeserializer<Pid> {

    private static final String VALUE = "value";
    private static final String QUALIFIER = "qualifier";
    private static final String CLASSID = "classid";

    /**
     * Reads the type first and the value second, then returns them wrapped in a new {@link Pid}.
     *
     * @throws IllegalArgumentException when the JSON object has no "qualifier" member
     */
    @Override
    public Pid deserialize(final JsonElement json, final Type typeOfT, final JsonDeserializationContext context) throws JsonParseException {
        final Pid parsed = new Pid();
        parsed.setType(readType(json));
        parsed.setValue(readValue(json));
        return parsed;
    }

    /** Returns the "value" member of the given JSON object as a string. */
    private String readValue(final JsonElement json) {
        return json.getAsJsonObject().get(VALUE).getAsString();
    }

    /** Returns the "classid" found inside the "qualifier" member of the given JSON object. */
    private String readType(final JsonElement json) {
        final JsonObject pidObject = json.getAsJsonObject();
        if (pidObject.has(QUALIFIER)) {
            final JsonObject qualifier = pidObject.getAsJsonObject(QUALIFIER);
            return qualifier.get(CLASSID).getAsString();
        }
        throw new IllegalArgumentException("pid does not contain any type: " + json.toString());
    }
}

View File

@ -0,0 +1,70 @@
package eu.dnetlib.pace.tree;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
@ComparatorClass("jsonListMatch")
public class JsonListMatch extends AbstractComparator {
private static final Log log = LogFactory.getLog(JsonListMatch.class);
private Map<String, String> params;
public JsonListMatch(final Map<String, String> params) {
super(params);
this.params = params;
}
@Override
public double compare(final Field a, final Field b, final Config conf) {
final List<String> sa = ((FieldList) a).stringList();
final List<String> sb = ((FieldList) b).stringList();
if (sa.isEmpty() || sb.isEmpty()) {
return -1;
}
final Set<String> ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet());
final Set<String> cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet());
int incommon = Sets.intersection(ca, cb).size();
int simDiff = Sets.symmetricDifference(ca, cb).size();
if (incommon + simDiff == 0) {
return 0.0;
}
return (double)incommon / (incommon + simDiff) > Double.parseDouble(params.getOrDefault("threshold", "0.5")) ? 1 : 0;
}
//converts every json into a comparable string basing on parameters
private String toComparableString(String json){
StringBuilder st = new StringBuilder(); //to build the string used for comparisons basing on the jpath into parameters
//for each path in the param list
for (String key: params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) {
String path = params.get(key);
String value = MapDocumentUtil.getJPathString(path, json);
if (value == null || value.isEmpty())
value = "";
st.append( value + "::");
}
st.setLength(st.length()-2);
return st.toString();
}
}

View File

@ -1,64 +0,0 @@
package eu.dnetlib.pace.tree;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.adaptor.Pid;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
/**
 * Comparator registered as "pidMatch". Both fields are read as lists of JSON strings, parsed into
 * {@link Pid} objects and compared as sets of "type + normalised value" keys.
 *
 * The "threshold" parameter (default "0.5") is the ratio that has to be strictly exceeded to
 * report a match.
 */
@ComparatorClass("pidMatch")
public class PidMatch extends AbstractComparator {

    private static final Log log = LogFactory.getLog(PidMatch.class);

    private Map<String, String> params;

    public PidMatch(final Map<String, String> params) {
        super(params);
        this.params = params;
    }

    /**
     * @return -1 when either list of identifiers is empty; otherwise 1 when
     *         |intersection| / (|intersection| + |symmetric difference|) is strictly greater than
     *         the threshold, 0 when it is not
     */
    @Override
    public double compare(final Field a, final Field b, final Config conf) {

        final List<String> rawLeft = ((FieldList) a).stringList();
        final List<String> rawRight = ((FieldList) b).stringList();

        final List<Pid> leftPids = Pid.fromOafJson(rawLeft);
        final List<Pid> rightPids = Pid.fromOafJson(rawRight);

        if (leftPids.isEmpty() || rightPids.isEmpty()) {
            return -1;
        }

        final Set<String> leftKeys = toHashSet(leftPids);
        final Set<String> rightKeys = toHashSet(rightPids);

        final int shared = Sets.intersection(leftKeys, rightKeys).size();
        final int unshared = Sets.symmetricDifference(leftKeys, rightKeys).size();
        final int total = shared + unshared;

        if (total == 0) {
            return 0.0;
        }

        final double ratio = (double) shared / total;
        final double threshold = Double.parseDouble(params.getOrDefault("threshold", "0.5"));
        if (ratio > threshold) {
            return 1;
        }
        return 0;
    }

    /**
     * Builds the set of comparison keys: for every identifier, its type followed by its value
     * passed through normalizePid (declared outside this class).
     */
    private Set<String> toHashSet(List<Pid> pbl) {
        final Set<String> keys = new HashSet<>();
        for (final Pid pid : pbl) {
            keys.add(pid.getType() + normalizePid(pid.getValue()));
        }
        return keys;
    }
}

View File

@ -0,0 +1,47 @@
package eu.dnetlib.pace.tree;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
/**
 * Comparator registered as "stringListMatch". Both fields are read as lists of strings and
 * compared as sets.
 *
 * The "threshold" parameter (default "0.5") is the ratio that has to be strictly exceeded to
 * report a match.
 */
@ComparatorClass("stringListMatch")
public class StringListMatch extends AbstractComparator {

    private static final Log log = LogFactory.getLog(StringListMatch.class);

    private Map<String, String> params;

    public StringListMatch(final Map<String, String> params) {
        super(params);
        this.params = params;
    }

    /**
     * @return -1 (undefined) when either list is empty; otherwise 1 when
     *         |intersection| / (|intersection| + |symmetric difference|) is strictly greater than
     *         the threshold, 0 when it is not
     */
    @Override
    public double compare(final Field a, final Field b, final Config conf) {

        final Set<String> left = new HashSet<>(((FieldList) a).stringList());
        final Set<String> right = new HashSet<>(((FieldList) b).stringList());

        // the result is undefined as soon as one of the two lists has no element
        if (left.isEmpty() || right.isEmpty()) {
            return -1;
        }

        final int shared = Sets.intersection(left, right).size();
        final int unshared = Sets.symmetricDifference(left, right).size();
        final int total = shared + unshared;

        if (total == 0) {
            return 0.0;
        }

        final double ratio = (double) shared / total;
        final double threshold = Double.parseDouble(params.getOrDefault("threshold", "0.5"));
        if (ratio > threshold) {
            return 1;
        }
        return 0;
    }
}

View File

@ -2,7 +2,9 @@ package eu.dnetlib.pace.util;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.jayway.jsonpath.Configuration;
import com.jayway.jsonpath.JsonPath;
import com.jayway.jsonpath.Option;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field;
@ -55,7 +57,7 @@ public class MapDocumentUtil {
public static List<String> getJPathList(String path, String json, Type type) {
if (type == Type.List)
return JsonPath.read(json, path);
return JsonPath.using(Configuration.defaultConfiguration().addOptions(Option.ALWAYS_RETURN_LIST, Option.SUPPRESS_EXCEPTIONS)).parse(json).read(path);
Object jresult;
List<String> result = new ArrayList<>();
try {

View File

@ -18,7 +18,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
@Before
public void setUp() throws Exception {
params = Maps.newHashMap();
conf = DedupConfig.load(AbstractPaceFunctions.readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf", ClusteringFunctionTest.class));
conf = DedupConfig.load(AbstractPaceFunctions.readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ClusteringFunctionTest.class));
}
@Test
@ -110,15 +110,6 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
}
@Test
public void testPersonClustering2() {
final ClusteringFunction cf = new PersonClustering(params);
final String s = readFromClasspath("gt.author.json");
System.out.println(s);
System.out.println(cf.apply(conf, Lists.newArrayList(person(s))));
}
@Test
public void testKeywordsClustering() {

View File

@ -4,7 +4,6 @@ import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.tree.*;
import eu.dnetlib.pace.config.DedupConfig;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
@ -24,7 +23,7 @@ public class ComparatorTest extends AbstractPaceFunctions {
public void setup() {
params = new HashMap<>();
params.put("weight", "1.0");
conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf", ComparatorTest.class));
conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ComparatorTest.class));
}
@ -115,5 +114,9 @@ public class ComparatorTest extends AbstractPaceFunctions {
System.out.println("result = " + result);
}
@Test
public void jsonListMatchTest() {
}
}

View File

@ -2,11 +2,15 @@ package eu.dnetlib.pace.config;
import eu.dnetlib.pace.AbstractPaceTest;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.tree.JsonListMatch;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.apache.commons.io.IOUtils;
import org.junit.Test;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static org.junit.Assert.assertEquals;
@ -16,7 +20,7 @@ public class ConfigTest extends AbstractPaceTest {
@Test
public void dedupConfigSerializationTest() {
final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("organization.current.conf"));
final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("organization.current.conf.json"));
final String conf = cfgFromClasspath.toString();
@ -26,13 +30,12 @@ public class ConfigTest extends AbstractPaceTest {
assertNotNull(cfgFromClasspath);
assertNotNull(cfgFromSerialization);
}
@Test
public void dedupConfigTest() {
DedupConfig load = DedupConfig.load(readFromClasspath("organization.current.conf"));
DedupConfig load = DedupConfig.load(readFromClasspath("organization.current.conf.json"));
System.out.println(load.toString());
}
@ -40,7 +43,7 @@ public class ConfigTest extends AbstractPaceTest {
@Test
public void initTranslationMapTest() {
DedupConfig load = DedupConfig.load(readFromClasspath("organization.current.conf"));
DedupConfig load = DedupConfig.load(readFromClasspath("organization.current.conf.json"));
Map<String, String> translationMap = load.translationMap();
@ -50,38 +53,26 @@ public class ConfigTest extends AbstractPaceTest {
if (translationMap.get(key).equals("key::1"))
System.out.println("key = " + key);
}
}
@Test
public void emptyTranslationMapTest() {
DedupConfig load = DedupConfig.load(readFromClasspath("organization.no_synonyms.conf"));
DedupConfig load = DedupConfig.load(readFromClasspath("organization.no_synonyms.conf.json"));
assertEquals(0, load.getPace().translationMap().keySet().size());
}
@Test
public void testAsMapDocumentJPath() throws Exception {
DedupConfig load = DedupConfig.load(readFromClasspath("result.pace.conf_jpath.json"));
System.out.println(load.getWf().getIdPath());
final String result =IOUtils.toString(this.getClass().getResourceAsStream("result.json"));
System.out.println(result);
final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(load, result);
System.out.println(mapDocument.getFieldMap());
}
public void asMapDocumentTest() throws Exception {
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json"));
final String json = readFromClasspath("publication.json");
final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json);
System.out.println("mapDocument = " + mapDocument.getFieldMap());
}
}

View File

@ -1,126 +0,0 @@
package eu.dnetlib.pace.model;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.util.Set;
import org.junit.Test;
/**
 * Exercises {@code PersonComparatorUtils.getNgramsForPerson}: every case checks how many tokens
 * are returned for a person name and that the listed tokens are among them.
 */
public class PersonComparatorUtilsNGramsTest {

    @Test
    public void testNormaizePerson_1() {
        verifyGetNgramsForPerson("Artini Michele", 2, "a_michele", "m_artini");
    }

    @Test
    public void testNormaizePerson_2() {
        verifyGetNgramsForPerson("Michele Artini", 2, "a_michele", "m_artini");
    }

    @Test
    public void testNormaizePerson_3() {
        verifyGetNgramsForPerson("Michele ARTINI", 1, "m_artini");
    }

    @Test
    public void testNormaizePerson_4() {
        verifyGetNgramsForPerson("ARTINI Michele", 1, "m_artini");
    }

    @Test
    public void testNormaizePerson_5() {
        verifyGetNgramsForPerson("Michele G. Artini", 2, "m_artini", "g_artini");
    }

    @Test
    public void testNormaizePerson_6() {
        verifyGetNgramsForPerson(" Artini, Michele ", 1, "m_artini");
    }

    @Test
    public void testNormaizePerson_7() {
        verifyGetNgramsForPerson("Artini, Michele (sig.)", 1, "m_artini");
    }

    @Test
    public void testNormaizePerson_8() {
        verifyGetNgramsForPerson("Artini Michele [sig.] ", 2, "a_michele", "m_artini");
    }

    @Test
    public void testNormaizePerson_9() {
        verifyGetNgramsForPerson("Artini, M", 1, "m_artini");
    }

    @Test
    public void testNormaizePerson_10() {
        verifyGetNgramsForPerson("Artini, M.", 1, "m_artini");
    }

    @Test
    public void testNormaizePerson_11() {
        verifyGetNgramsForPerson("Artini, M. (sig.)", 1, "m_artini");
    }

    @Test
    public void testNormaizePerson_12() {
        verifyGetNgramsForPerson("Artini, M[sig.] ", 1, "m_artini");
    }

    @Test
    public void testNormaizePerson_13() {
        verifyGetNgramsForPerson("Artini-SIG, Michele ", 1, "m_artini-sig");
    }

    @Test
    public void testNormaizePerson_14() {
        verifyGetNgramsForPerson("Artini - SIG, Michele ", 1, "m_artini-sig");
    }

    @Test
    public void testNormaizePerson_15() {
        verifyGetNgramsForPerson("Artini {sig.}, M", 1, "m_artini");
    }

    @Test
    public void testNormaizePerson_16() {
        verifyGetNgramsForPerson("Artini, M., sig.", 1, "m_artini");
    }

    @Test
    public void testNormaizePerson_17() {
        verifyGetNgramsForPerson("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA, BBBBBBBBBBBBBBBBBBBBBBBBBBBBB CCCCCCCCCCCCCCCCCCCC", 0);
    }

    @Test
    public void testNormaizePerson_18() {
        verifyGetNgramsForPerson("Dell'amico, Andrea", 1, "a_amico");
    }

    @Test
    public void testNormaizePerson_19() {
        verifyGetNgramsForPerson("Smith, Paul van der", 1, "p_smith");
    }

    @Test
    public void testNormaizePerson_20() {
        verifyGetNgramsForPerson("AAAAAAA, BBBB, CCCC, DDDD, EEEE", 1, "b_aaaaaaa");
    }

    @Test
    public void testNormaizePerson_21() {
        verifyGetNgramsForPerson("Kompetenzzentrum Informelle Bildung (KIB),", 6);
    }

    /**
     * Computes the tokens for the given name, prints them, then asserts their number and the
     * presence of every expected token.
     *
     * @param name the person name handed to getNgramsForPerson
     * @param expectedSize the exact number of tokens expected
     * @param expectedTokens tokens that must all be contained in the result (may be none)
     */
    private void verifyGetNgramsForPerson(String name, int expectedSize, String... expectedTokens) {
        final Set<String> ngrams = PersonComparatorUtils.getNgramsForPerson(name);
        System.out.println(ngrams);

        assertEquals(expectedSize, ngrams.size());
        for (int i = 0; i < expectedTokens.length; i++) {
            assertTrue(ngrams.contains(expectedTokens[i]));
        }
    }
}

View File

@ -1,89 +0,0 @@
package eu.dnetlib.pace.model;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import org.junit.Test;
/**
 * Checks which pairs of person names {@code PersonComparatorUtils.areSimilar} accepts and which
 * ones it rejects.
 */
public class PersonComparatorUtilsSimilarityTest {

    @Test
    public void testSimilarity_0() {
        expectSimilar("Artini Michele", "Michele Artini");
    }

    @Test
    public void testSimilarity_1() {
        expectSimilar("ARTINI Michele", "Artini, Michele");
    }

    @Test
    public void testSimilarity_2() {
        expectSimilar("Artini, M.", "Artini Michele");
    }

    @Test
    public void testSimilarity_3() {
        expectSimilar("Artini, M.G.", "Artini, Michele");
    }

    @Test
    public void testSimilarity_4() {
        expectSimilar("Artini, M.", "Artini, M.G.");
    }

    @Test
    public void testSimilarity_5() {
        expectSimilar("Artini, M. (sig.)", "Artini, Michele");
    }

    @Test
    public void testSimilarity_6() {
        expectNotSimilar("Artini, M.", "Artini, G.");
    }

    @Test
    public void testSimilarity_7() {
        expectNotSimilar("Artini, M.G.", "Artini, M.A.");
    }

    @Test
    public void testSimilarity_8() {
        expectNotSimilar("Artini, M.", "Artini, Giuseppe");
    }

    @Test
    public void testSimilarity_9() {
        expectNotSimilar("Manghi, Paolo", "Artini, Michele");
    }

    @Test
    public void testSimilarity_10() {
        expectSimilar("Artini, Michele", "Artini, Michele Giovanni");
    }

    @Test
    public void testSimilarity_11() {
        expectNotSimilar("Artini, M.A.G.", "Artini, M.B.G.");
    }

    @Test
    public void testSimilarity_12() {
        expectNotSimilar("Artini Manghi, M.", "Artini, Michele");
    }

    @Test
    public void testSimilarity_13() {
        expectSimilar("Artini Manghi, M.", "Artini Manghi Michele");
    }

    @Test
    public void testSimilarity_14() {
        expectNotSimilar("Artini, Michele", "Michele, Artini");
    }

    @Test
    public void testSimilarity_15() {
        expectSimilar("Artini, M.", "Michele ARTINI");
    }

    /** Asserts that areSimilar returns true for the two names, passed in the given order. */
    private static void expectSimilar(final String first, final String second) {
        assertTrue(PersonComparatorUtils.areSimilar(first, second));
    }

    /** Asserts that areSimilar returns false for the two names, passed in the given order. */
    private static void expectNotSimilar(final String first, final String second) {
        assertFalse(PersonComparatorUtils.areSimilar(first, second));
    }
}

View File

@ -1,111 +0,0 @@
package eu.dnetlib.pace.model;
import static org.junit.Assert.assertEquals;
import java.text.Normalizer;
import java.util.Queue;
import org.junit.Test;
import com.google.common.collect.Lists;
/**
 * Checks the normalised full name and the hash that {@code Person} derives from raw name
 * strings written with different casing, ordering, spacing and diacritics.
 */
public class PersonTest {

    @Test
    public void test_1() {
        final String input = "Atzori, Claudio";
        check(input, "Atzori, Claudio");
    }

    @Test
    public void test_2() {
        final String input = "Atzori, Claudio A.";
        check(input, "Atzori, Claudio A.");
    }

    @Test
    public void test_3() {
        final String input = "Claudio ATZORI";
        check(input, "Atzori, Claudio");
    }

    @Test
    public void test_4() {
        final String input = "ATZORI, Claudio";
        check(input, "Atzori, Claudio");
    }

    @Test
    public void test_5() {
        final String input = "Claudio Atzori";
        check(input, "Claudio Atzori");
    }

    @Test
    public void test_6() {
        final String input = " Manghi , Paolo";
        check(input, "Manghi, Paolo");
    }

    @Test
    public void test_7() {
        final String input = "ATZORI, CLAUDIO";
        check(input, "Atzori, Claudio");
    }

    @Test
    public void test_8() {
        final String input = "ATZORI, CLAUDIO A";
        check(input, "Atzori, Claudio A.");
    }

    @Test
    public void test_9() {
        final String input = "Bølviken, B.";
        check(input, "Bølviken, B.");
    }

    @Test
    public void test_10() {
        // The expected surname carries the NFD (decomposed) form of the accented letter.
        final String decomposed = Normalizer.normalize("ñ", Normalizer.Form.NFD);
        check("Bñlviken, B.", "B" + decomposed + "lviken, B.");
    }

    @Test
    public void test_11() {
        final String input = "aáeéiíoóöőuúüű AÁEÉIÍOÓÖŐUÚÜŰ ø";
        final String expected = "Aaeeiioooouuuu, Aaeeiioooouuuu Ø.";
        check(input, expected, true);
    }

    @Test
    public void test_12() {
        // In non-aggressive mode the expected full name is the NFD form of the input itself.
        final String input = "aáeéiíoóöőuúüű AÁEÉIÍOÓÖŐUÚÜŰz ø";
        final String expected = Normalizer.normalize(input, Normalizer.Form.NFD);
        check(input, expected, false);
    }

    @Test
    public void test_13() {
        // In non-aggressive mode the expected full name is the NFD form of the input itself.
        final String input = "Tkačíková, Daniela";
        final String expected = Normalizer.normalize(input, Normalizer.Form.NFD);
        check(input, expected, false);
    }

    @Test
    public void test_hashes() {
        checkHash(" Claudio ATZORI ", "ATZORI Claudio", "Atzori , Claudio", "ATZORI, Claudio");
    }

    /**
     * Asserts that every given spelling produces the same hash as the first one.
     *
     * @param ss spellings of one person name; the first is the reference
     */
    private void checkHash(String... ss) {
        final Queue<String> remaining = Lists.newLinkedList(Lists.newArrayList(ss));
        final String referenceHash = new Person(remaining.remove(), false).hash();
        for (String variant : remaining) {
            assertEquals(referenceHash, new Person(variant, false).hash());
        }
    }

    /** Non-aggressive shortcut for {@link #check(String, String, boolean)}. */
    private void check(String s, String expectedFullName) {
        check(s, expectedFullName, false);
    }

    /**
     * Builds a {@code Person} from {@code s}, prints its parsed parts to stdout and asserts
     * that its normalised full name equals {@code expectedFullName}.
     *
     * @param s                raw name string
     * @param expectedFullName expected value of {@code getNormalisedFullname()}
     * @param aggressive       flag forwarded to the {@code Person} constructor
     */
    private void check(String s, String expectedFullName, boolean aggressive) {
        final Person person = new Person(s, aggressive);
        describe(person);
        assertEquals(expectedFullName, person.getNormalisedFullname());
    }

    /** Prints the parsed parts of {@code person}; name and surname only when it is accurate. */
    private static void describe(Person person) {
        System.out.println("original: " + person.getOriginal());
        System.out.println("accurate: " + person.isAccurate());
        System.out.println("normalised: '" + person.getNormalisedFullname() + "'");
        if (person.isAccurate()) {
            System.out.println("name: " + person.getNormalisedFirstName());
            System.out.println("surname: " + person.getNormalisedSurname());
        }
        System.out.println("hash: " + person.hash());
        System.out.println("");
    }
}

File diff suppressed because one or more lines are too long

View File

@ -18,16 +18,81 @@
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
],
"decisionTree" : {
"start": {"fields": [{"field":"gridid", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"true", "params": {}}], "threshold":1.0, "aggregation": "MAX", "positive":"MATCH", "negative":"layer2", "undefined":"layer2", "ignoreUndefined": "true"},
"layer2": {"fields": [{"field":"websiteurl", "comparator":"domainExactMatch", "weight":1.0, "countIfUndefined":"true", "params" : {}}, {"field":"country", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"false", "params": {}}], "threshold":1.0, "aggregation": "MIN", "positive":"layer3", "negative":"NO_MATCH", "undefined":"layer3", "ignoreUndefined": "false"},
"layer3": {"fields": [{"field":"legalname", "comparator":"jaroWinklerNormalizedName", "weight":0.9, "countIfUndefined":"false", "params":{"windowSize" : 4, "threshold" : 0.7}}, {"field":"legalshortname", "comparator":"jaroWinklerNormalizedName", "weight":0.1, "countIfUndefined":"true", "params":{}}], "threshold": 0.9, "aggregation": "W_MEAN", "positive":"MATCH", "negative":"NO_MATCH", "undefined":"NO_MATCH", "ignoreUndefined": "true"}
"start": {
"fields": [
{
"field": "gridid",
"comparator": "exactMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "layer2",
"undefined": "layer2",
"ignoreUndefined": "true"
},
"layer2": {
"fields": [
{
"field": "websiteurl",
"comparator": "domainExactMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
},
{
"field": "country",
"comparator": "exactMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "MIN",
"positive": "layer3",
"negative": "NO_MATCH",
"undefined": "layer3",
"ignoreUndefined": "false"
},
"layer3": {
"fields": [
{
"field": "legalname",
"comparator": "jaroWinklerNormalizedName",
"weight": 0.9,
"countIfUndefined": "false",
"params": {
"windowSize": 4,
"threshold": 0.7
}
},
{
"field": "legalshortname",
"comparator": "jaroWinklerNormalizedName",
"weight": 0.1,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.9,
"aggregation": "W_MEAN",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
}
},
"model" : [
{ "name" : "country", "type" : "String", "path" : "organization/metadata/country/classid"},
{ "name" : "legalshortname", "type" : "String", "path" : "organization/metadata/legalshortname/value"},
{ "name" : "legalname", "type" : "String", "path" : "organization/metadata/legalname/value" },
{ "name" : "websiteurl", "type" : "URL", "path" : "organization/metadata/websiteurl/value" },
{ "name" : "gridid", "type" : "String", "path" : "pid[qualifier#classid = {grid}]/value"}
{ "name" : "country", "type" : "String", "path" : "$.organization.metadata.country.classid"},
{ "name" : "legalshortname", "type" : "String", "path" : "$.organization.metadata.legalshortname.value"},
{ "name" : "legalname", "type" : "String", "path" : "$.organization.metadata.legalname.value" },
{ "name" : "websiteurl", "type" : "URL", "path" : "$.organization.metadata.websiteurl.value" },
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid ==\"grid\")].value"}
],
"blacklists" : {
"legalname" : []

View File

@ -23,11 +23,11 @@
"layer3": {"fields": [{"field":"legalname", "comparator":"jaroWinklerNormalizedName", "weight":0.9, "countIfUndefined":"false", "params":{"windowSize" : 4, "threshold" : 0.7}}, {"field":"legalshortname", "comparator":"jaroWinklerNormalizedName", "weight":0.1, "countIfUndefined":"true", "params":{}}], "threshold": 0.9, "aggregation": "W_MEAN", "positive":"MATCH", "negative":"NO_MATCH", "undefined":"NO_MATCH", "ignoreUndefined": "true"}
},
"model" : [
{ "name" : "country", "type" : "String", "path" : "organization/metadata/country/classid"},
{ "name" : "legalshortname", "type" : "String", "path" : "organization/metadata/legalshortname/value"},
{ "name" : "legalname", "type" : "String", "path" : "organization/metadata/legalname/value" },
{ "name" : "websiteurl", "type" : "URL", "path" : "organization/metadata/websiteurl/value" },
{ "name" : "gridid", "type" : "String", "path" : "pid[qualifier#classid = {grid}]/value"}
{ "name" : "country", "type" : "String", "path" : "$.organization.metadata.country.classid"},
{ "name" : "legalshortname", "type" : "String", "path" : "$.organization.metadata.legalshortname.value"},
{ "name" : "legalname", "type" : "String", "path" : "$.organization.metadata.legalname.value" },
{ "name" : "websiteurl", "type" : "URL", "path" : "$.organization.metadata.websiteurl.value" },
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid ==\"grid\")].value"}
],
"blacklists" : {
"legalname" : []

View File

@ -0,0 +1,387 @@
{
"wf": {
"threshold": "0.99",
"dedupRun": "001",
"entityType": "result",
"subEntityType": "resulttype",
"subEntityValue": "publication",
"orderField": "title",
"queueMaxSize": "2000",
"groupMaxSize": "100",
"maxChildren": "100",
"slidingWindowSize": "200",
"rootBuilder": [
"result",
"resultProject_outcome_isProducedBy",
"resultResult_publicationDataset_isRelatedTo",
"resultResult_similarity_isAmongTopNSimilarDocuments",
"resultResult_similarity_hasAmongTopNSimilarDocuments",
"resultOrganization_affiliation_isAffiliatedWith",
"resultResult_part_hasPart",
"resultResult_part_isPartOf",
"resultResult_supplement_isSupplementTo",
"resultResult_supplement_isSupplementedBy",
"resultResult_version_isVersionOf"
],
"includeChildren": "true",
"maxIterations": 20,
"idPath": "$.entity.id"
},
"pace": {
"clustering": [
{
"name": "ngrampairs",
"fields": [
"title"
],
"params": {
"max": "1",
"ngramLen": "3"
}
},
{
"name": "suffixprefix",
"fields": [
"title"
],
"params": {
"max": "1",
"len": "3"
}
},
{
"name": "lowercase",
"fields": [
"doi"
],
"params": {}
}
],
"decisionTree": {
"start": {
"fields": [
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"threshold": "0.5",
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid"
}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "layer2",
"undefined": "layer2",
"ignoreUndefined": "true"
},
"layer2": {
"fields": [
{
"field": "title",
"comparator": "titleVersionMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
},
{
"field": "authors",
"comparator": "sizeMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "NC",
"positive": "layer3",
"negative": "NO_MATCH",
"undefined": "layer3",
"ignoreUndefined": "false"
},
"layer3": {
"fields": [
{
"field": "title",
"comparator": "LevensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.99,
"aggregation": "SUM",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
}
},
"model": [
{
"name": "doi",
"type": "String",
"path": "$.pid[?(@.qualifier.classid == 'doi')].value"
},
{
"name": "pid",
"type": "JSON",
"path": "$.pid[*]",
"overrideMatch": "true"
},
{
"name": "title",
"type": "String",
"path": "$.title[?(@.qualifier.classid == 'main title')].value",
"length": 250,
"size": 5
},
{
"name": "authors",
"type": "List",
"path": "$.author[*].fullname",
"size": 200
},
{
"name": "resulttype",
"type": "String",
"path": "$.resulttype.classid"
}
],
"blacklists": {
"title": [
"^Inside Front Cover$",
"(?i)^Poster presentations$",
"^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
"^Problems with perinatal pathology\\.?$",
"(?i)^Cases? of Puerperal Convulsions$",
"(?i)^Operative Gyna?ecology$",
"(?i)^Mind the gap\\!?\\:?$",
"^Chronic fatigue syndrome\\.?$",
"^Cartas? ao editor Letters? to the Editor$",
"^Note from the Editor$",
"^Anesthesia Abstract$",
"^Annual report$",
"(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
"(?i)^Graph and Table of Infectious Diseases?$",
"^Presentation$",
"(?i)^Reviews and Information on Publications$",
"(?i)^PUBLIC HEALTH SERVICES?$",
"(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
"(?i)^Adrese autora$",
"(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
"(?i)^Acknowledgement to Referees$",
"(?i)^Behçet's disease\\.?$",
"(?i)^Isolation and identification of restriction endonuclease.*$",
"(?i)^CEREBROVASCULAR DISEASES?.?$",
"(?i)^Screening for abdominal aortic aneurysms?\\.?$",
"^Event management$",
"(?i)^Breakfast and Crohn's disease.*\\.?$",
"^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
"(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
"^Gushi hakubutsugaku$",
"^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
"^Intestinal spirocha?etosis$",
"^Treatment of Rodent Ulcer$",
"(?i)^\\W*Cloud Computing\\W*$",
"^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
"^Free Communications, Poster Presentations: Session [A-F]$",
"^“The Historical Aspects? of Quackery\\.?”$",
"^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
"^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
"(?i)^Case Report$",
"^Boletín Informativo$",
"(?i)^Glioblastoma Multiforme$",
"(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
"^Zaměstnanecké výhody$",
"(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
"(?i)^Carotid body tumours?\\.?$",
"(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
"^Avant-propos$",
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
"(?i)^PUBLIC HEALTH VERSUS THE STATE$",
"^Viñetas de Cortázar$",
"(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
"(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
"(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
"(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
"(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
"^Aus der AGMB$",
"^Znanstveno-stručni prilozi$",
"(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
"(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
"(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
"^Finanční analýza podniku$",
"^Financial analysis( of business)?$",
"(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
"^Jikken nihon shūshinsho$",
"(?i)^CORONER('|s)(s|') INQUESTS$",
"(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
"(?i)^Consultants' contract(s)?$",
"(?i)^Upute autorima$",
"(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
"^Joshi shin kokubun$",
"^Kōtō shōgaku dokuhon nōson'yō$",
"^Jinjō shōgaku shōka$",
"^Shōgaku shūjichō$",
"^Nihon joshi dokuhon$",
"^Joshi shin dokuhon$",
"^Chūtō kanbun dokuhon$",
"^Wabun dokuhon$",
"(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
"(?i)^cardiac rehabilitation$",
"(?i)^Analytical summary$",
"^Thesaurus resolutionum Sacrae Congregationis Concilii$",
"(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
"^Prikazi i osvrti$",
"^Rodinný dům s provozovnou$",
"^Family house with an establishment$",
"^Shinsei chūtō shin kokugun$",
"^Pulmonary alveolar proteinosis(\\.?)$",
"^Shinshū kanbun$",
"^Viñeta(s?) de Rodríguez$",
"(?i)^RUBRIKA UREDNIKA$",
"^A Matching Model of the Academic Publication Market$",
"^Yōgaku kōyō$",
"^Internetový marketing$",
"^Internet marketing$",
"^Chūtō kokugo dokuhon$",
"^Kokugo dokuhon$",
"^Antibiotic Cover for Dental Extraction(s?)$",
"^Strategie podniku$",
"^Strategy of an Enterprise$",
"(?i)^respiratory disease(s?)(\\.?)$",
"^Award(s?) for Gallantry in Civil Defence$",
"^Podniková kultura$",
"^Corporate Culture$",
"^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
"^Pracovní motivace$",
"^Work Motivation$",
"^Kaitei kōtō jogaku dokuhon$",
"^Konsolidovaná účetní závěrka$",
"^Consolidated Financial Statements$",
"(?i)^intracranial tumour(s?)$",
"^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
"^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
"^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
"^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
"^Úroveň motivačního procesu jako způsobu vedení lidí$",
"^The level of motivation process as a leadership$",
"^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
"(?i)^news and events$",
"(?i)^NOVOSTI I DOGAĐAJI$",
"^Sansū no gakushū$",
"^Posouzení informačního systému firmy a návrh změn$",
"^Information System Assessment and Proposal for ICT Modification$",
"^Stresové zatížení pracovníků ve vybrané profesi$",
"^Stress load in a specific job$",
"^Sunday: Poster Sessions, Pt.*$",
"^Monday: Poster Sessions, Pt.*$",
"^Wednesday: Poster Sessions, Pt.*",
"^Tuesday: Poster Sessions, Pt.*$",
"^Analýza reklamy$",
"^Analysis of advertising$",
"^Shōgaku shūshinsho$",
"^Shōgaku sansū$",
"^Shintei joshi kokubun$",
"^Taishō joshi kokubun dokuhon$",
"^Joshi kokubun$",
"^Účetní uzávěrka a účetní závěrka v ČR$",
"(?i)^The \"?Causes\"? of Cancer$",
"^Normas para la publicación de artículos$",
"^Editor('|s)(s|') [Rr]eply$",
"^Editor(|s)(s|) letter$",
"^Redaktoriaus žodis$",
"^DISCUSSION ON THE PRECEDING PAPER$",
"^Kōtō shōgaku shūshinsho jidōyō$",
"^Shōgaku nihon rekishi$",
"^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
"^Préface$",
"^Occupational [Hh]ealth [Ss]ervices.$",
"^In Memoriam Professor Toshiyuki TAKESHIMA$",
"^Účetní závěrka ve vybraném podniku.*$",
"^Financial statements in selected company$",
"^Abdominal [Aa]ortic [Aa]neurysms.*$",
"^Pseudomyxoma peritonei$",
"^Kazalo autora$",
"(?i)^uvodna riječ$",
"^Motivace jako způsob vedení lidí$",
"^Motivation as a leadership$",
"^Polyfunkční dům$",
"^Multi\\-funkcional building$",
"^Podnikatelský plán$",
"(?i)^Podnikatelský záměr$",
"(?i)^Business Plan$",
"^Oceňování nemovitostí$",
"^Marketingová komunikace$",
"^Marketing communication$",
"^Sumario Analítico$",
"^Riječ uredništva$",
"^Savjetovanja i priredbe$",
"^Índice$",
"^(Starobosanski nadpisi).*$",
"^Vzdělávání pracovníků v organizaci$",
"^Staff training in organization$",
"^(Life Histories of North American Geometridae).*$",
"^Strategická analýza podniku$",
"^Strategic Analysis of an Enterprise$",
"^Sadržaj$",
"^Upute suradnicima$",
"^Rodinný dům$",
"(?i)^Fami(l)?ly house$",
"^Upute autorima$",
"^Strategic Analysis$",
"^Finanční analýza vybraného podniku$",
"^Finanční analýza$",
"^Riječ urednika$",
"(?i)^Content(s?)$",
"(?i)^Inhalt$",
"^Jinjō shōgaku shūshinsho jidōyō$",
"(?i)^Index$",
"^Chūgaku kokubun kyōkasho$",
"^Retrato de una mujer$",
"^Retrato de un hombre$",
"^Kōtō shōgaku dokuhon$",
"^Shotōka kokugo$",
"^Shōgaku dokuhon$",
"^Jinjō shōgaku kokugo dokuhon$",
"^Shinsei kokugo dokuhon$",
"^Teikoku dokuhon$",
"^Instructions to Authors$",
"^KİTAP TAHLİLİ$",
"^PRZEGLĄD PIŚMIENNICTWA$",
"(?i)^Presentación$",
"^İçindekiler$",
"(?i)^Tabl?e of contents$",
"^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
"^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
"^Editorial( Board)?$",
"(?i)^Editorial \\(English\\)$",
"^Editörden$",
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
"^(Kiri Karl Morgensternile).*$",
"^(\\[Eksliibris Aleksandr).*\\]$",
"^(\\[Eksliibris Aleksandr).*$",
"^(Eksliibris Aleksandr).*$",
"^(Kiri A\\. de Vignolles).*$",
"^(2 kirja Karl Morgensternile).*$",
"^(Pirita kloostri idaosa arheoloogilised).*$",
"^(Kiri tundmatule).*$",
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
"^(Eksliibris Nikolai Birukovile).*$",
"^(Eksliibris Nikolai Issakovile).*$",
"^(WHP Cruise Summary Information of section).*$",
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
"^(Measurement of the spin\\-dependent structure function).*",
"(?i)^.*authors[']? reply\\.?$",
"(?i)^.*authors[']? response\\.?$"
]
},
"synonyms": {}
}
}

View File

@ -1,48 +0,0 @@
{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "result",
"orderField" : "title",
"queueMaxSize" : "2000",
"groupMaxSize" : "10",
"slidingWindowSize" : "200",
"idPath": "$.entity.id",
"rootBuilder" : [ "result" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "acronyms", "fields" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} },
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }
],
"decisionTree": {},
"model" : [
{ "name" : "pid", "type" : "JSON", "path" : "$.entity.pid"},
{ "name" : "dateofacceptance", "type" : "String", "path" : "$.entity.result.metadata.dateofacceptance.value"},
{ "name" : "title", "type" : "String","path" : "$.entity.result.metadata.title[?(@.qualifier.classid ==\"main title\")].value" },
{ "name" : "authors", "type" : "List", "path" : "$.entity.result.metadata.author[*].fullname" }
],
"blacklists" : {
"title" : [
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
"^(Kiri Karl Morgensternile).*$",
"^(\\[Eksliibris Aleksandr).*\\]$",
"^(\\[Eksliibris Aleksandr).*$",
"^(Eksliibris Aleksandr).*$",
"^(Kiri A\\. de Vignolles).*$",
"^(2 kirja Karl Morgensternile).*$",
"^(Pirita kloostri idaosa arheoloogilised).*$",
"^(Kiri tundmatule).*$",
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
"^(Eksliibris Nikolai Birukovile).*$",
"^(Eksliibris Nikolai Issakovile).*$",
"^(WHP Cruise Summary Information of section).*$",
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
"^(Measurement of the spin\\-dependent structure function).*"
] } ,
"synonyms": {}
}
}

View File

@ -1,51 +0,0 @@
{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "result",
"orderField" : "title",
"queueMaxSize" : "2000",
"groupMaxSize" : "10",
"slidingWindowSize" : "200",
"rootBuilder" : [ "result" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "acronyms", "fields" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} },
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }
],
"decisionTree": {
"start": {"fields": [{"field":"pid", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"true", "params": {}}], "threshold":1.0, "aggregation": "MAX", "positive":"MATCH", "negative":"layer2", "undefined":"layer2", "ignoreUndefined": "true"},
"layer2": {"fields": [{"field":"dateofacceptance", "comparator":"yearMatch", "weight":1.0, "countIfUndefined":"true", "params" : {}}, {"field":"title", "comparator":"titleVersionMatch", "weight":1.0, "countIfUndefined":"false", "params": {}}, {"field":"authors", "comparator":"sizeMatch", "weight":1.0, "countIfUndefined":"false", "params": {}}], "threshold":1.0, "aggregation": "MIN", "positive":"layer3", "negative":"NO_MATCH", "undefined":"layer3", "ignoreUndefined": "false"},
"layer3": {"fields": [{"field":"title", "comparator":"JaroWinkler", "weight":1.0, "countIfUndefined":"false", "params":{}}], "threshold": 0.99, "aggregation": "MAX", "positive":"MATCH", "negative":"NO_MATCH", "undefined":"NO_MATCH", "ignoreUndefined": "false"}
},
"model" : [
{ "name" : "pid", "type" : "String", "path" : "pid[qualifier#classid = {doi}]/value", "overrideMatch" : "true" },
{ "name" : "title", "type" : "String", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
{ "name" : "dateofacceptance", "type" : "String", "path" : "result/metadata/dateofacceptance/value" } ,
{ "name" : "authors", "type" : "List", "path" : "result/author/metadata/fullname/value" }
],
"blacklists" : {
"title" : [
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
"^(Kiri Karl Morgensternile).*$",
"^(\\[Eksliibris Aleksandr).*\\]$",
"^(\\[Eksliibris Aleksandr).*$",
"^(Eksliibris Aleksandr).*$",
"^(Kiri A\\. de Vignolles).*$",
"^(2 kirja Karl Morgensternile).*$",
"^(Pirita kloostri idaosa arheoloogilised).*$",
"^(Kiri tundmatule).*$",
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
"^(Eksliibris Nikolai Birukovile).*$",
"^(Eksliibris Nikolai Issakovile).*$",
"^(WHP Cruise Summary Information of section).*$",
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
"^(Measurement of the spin\\-dependent structure function).*"
] },
"synonyms": {}
}
}