implementation of new json comparator and update of the publication configuration
This commit is contained in:
parent
d09193a094
commit
159cb2a493
|
@ -13,6 +13,7 @@ import org.apache.spark.api.java.JavaRDD;
|
|||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.Before;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import scala.Tuple2;
|
||||
|
||||
|
@ -30,7 +31,7 @@ public class DedupLocalTest extends DedupTestUtils {
|
|||
@Before
|
||||
public void setup() {
|
||||
|
||||
config = DedupConfig.load(Utility.readFromClasspath("/eu/dnetlib/pace/config/organization.strict.conf.json", DedupLocalTest.class));
|
||||
config = DedupConfig.load(Utility.readFromClasspath("/eu/dnetlib/pace/config/publication.current.conf.json", DedupLocalTest.class));
|
||||
treeProcessor = new TreeProcessor(config);
|
||||
|
||||
final SparkSession spark = SparkSession
|
||||
|
@ -39,11 +40,13 @@ public class DedupLocalTest extends DedupTestUtils {
|
|||
.master("local[*]")
|
||||
.getOrCreate();
|
||||
context = new JavaSparkContext(spark.sparkContext());
|
||||
final URL dataset = getClass().getResource("/eu/dnetlib/pace/examples/organization.to.fix.json");
|
||||
|
||||
final URL dataset = getClass().getResource("/eu/dnetlib/pace/examples/publications.dump.1000.json");
|
||||
entities = context.textFile(dataset.getPath());
|
||||
|
||||
}
|
||||
|
||||
@Ignore
|
||||
@Test
|
||||
public void dedupTest(){
|
||||
|
||||
|
@ -57,6 +60,7 @@ public class DedupLocalTest extends DedupTestUtils {
|
|||
|
||||
}
|
||||
|
||||
@Ignore
|
||||
@Test
|
||||
public void relationsTest() {
|
||||
|
||||
|
@ -112,7 +116,7 @@ public class DedupLocalTest extends DedupTestUtils {
|
|||
|
||||
}
|
||||
|
||||
|
||||
@Ignore
|
||||
@Test
|
||||
public void matchTest(){
|
||||
|
||||
|
@ -128,7 +132,7 @@ public class DedupLocalTest extends DedupTestUtils {
|
|||
|
||||
}
|
||||
|
||||
|
||||
@Ignore
|
||||
@Test
|
||||
public void parseJSONEntityTest(){
|
||||
String jsonEntity = "{\"dateoftransformation\":\"2018-09-19\",\"originalId\":[\"doajarticles::Sociedade_Brasileira_de_Reumatologia\"],\"collectedfrom\":[{\"value\":\"DOAJ-Articles\",\"key\":\"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824\"}],\"organization\":{\"metadata\":{\"eclegalbody\":{\"value\":\"false\"},\"eclegalperson\":{\"value\":\"false\"},\"ecinternationalorganization\":{\"value\":\"false\"},\"legalshortname\":{\"value\":\"Sociedade Brasileira de Reumatologia\"},\"ecresearchorganization\":{\"value\":\"false\"},\"ecnonprofit\":{\"value\":\"false\"},\"ecenterprise\":{\"value\":\"false\"},\"ecnutscode\":{\"value\":\"false\"},\"ecinternationalorganizationeurinterests\":{\"value\":\"false\"},\"legalname\":{\"value\":\"Sociedade Brasileira de Reumatologia\"},\"country\":{\"classid\":\"BR\",\"classname\":\"Brazil\",\"schemename\":\"dnet:countries\",\"schemeid\":\"dnet:countries\"},\"echighereducation\":{\"value\":\"false\"},\"ecsmevalidated\":{\"value\":\"false\"}}},\"dateofcollection\":\"2018-09-19\",\"type\":20,\"id\":\"20|doajarticles::0019ba7a22c5bc733c3206bde28ff568\"}";
|
||||
|
|
|
@ -0,0 +1,387 @@
|
|||
{
|
||||
"wf": {
|
||||
"threshold": "0.99",
|
||||
"dedupRun": "001",
|
||||
"entityType": "result",
|
||||
"subEntityType": "resulttype",
|
||||
"subEntityValue": "publication",
|
||||
"orderField": "title",
|
||||
"queueMaxSize": "2000",
|
||||
"groupMaxSize": "100",
|
||||
"maxChildren": "100",
|
||||
"slidingWindowSize": "200",
|
||||
"rootBuilder": [
|
||||
"result",
|
||||
"resultProject_outcome_isProducedBy",
|
||||
"resultResult_publicationDataset_isRelatedTo",
|
||||
"resultResult_similarity_isAmongTopNSimilarDocuments",
|
||||
"resultResult_similarity_hasAmongTopNSimilarDocuments",
|
||||
"resultOrganization_affiliation_isAffiliatedWith",
|
||||
"resultResult_part_hasPart",
|
||||
"resultResult_part_isPartOf",
|
||||
"resultResult_supplement_isSupplementTo",
|
||||
"resultResult_supplement_isSupplementedBy",
|
||||
"resultResult_version_isVersionOf"
|
||||
],
|
||||
"includeChildren": "true",
|
||||
"maxIterations": 20,
|
||||
"idPath": "$.id"
|
||||
},
|
||||
"pace": {
|
||||
"clustering": [
|
||||
{
|
||||
"name": "ngrampairs",
|
||||
"fields": [
|
||||
"title"
|
||||
],
|
||||
"params": {
|
||||
"max": "1",
|
||||
"ngramLen": "3"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "suffixprefix",
|
||||
"fields": [
|
||||
"title"
|
||||
],
|
||||
"params": {
|
||||
"max": "1",
|
||||
"len": "3"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "lowercase",
|
||||
"fields": [
|
||||
"doi"
|
||||
],
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"decisionTree": {
|
||||
"start": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "pid",
|
||||
"comparator": "jsonListMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "false",
|
||||
"params": {
|
||||
"threshold": "0.5",
|
||||
"jpath_value": "$.value",
|
||||
"jpath_classid": "$.qualifier.classid"
|
||||
}
|
||||
}
|
||||
],
|
||||
"threshold": 1.0,
|
||||
"aggregation": "MAX",
|
||||
"positive": "MATCH",
|
||||
"negative": "layer2",
|
||||
"undefined": "layer2",
|
||||
"ignoreUndefined": "true"
|
||||
},
|
||||
"layer2": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "title",
|
||||
"comparator": "titleVersionMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "false",
|
||||
"params": {}
|
||||
},
|
||||
{
|
||||
"field": "authors",
|
||||
"comparator": "sizeMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "false",
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"threshold": 1.0,
|
||||
"aggregation": "NC",
|
||||
"positive": "layer3",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "layer3",
|
||||
"ignoreUndefined": "false"
|
||||
},
|
||||
"layer3": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "title",
|
||||
"comparator": "levensteinTitle",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "true",
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"threshold": 0.99,
|
||||
"aggregation": "SUM",
|
||||
"positive": "MATCH",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "NO_MATCH",
|
||||
"ignoreUndefined": "true"
|
||||
}
|
||||
},
|
||||
"model": [
|
||||
{
|
||||
"name": "doi",
|
||||
"type": "String",
|
||||
"path": "$.pid[?(@.qualifier.classid == 'doi')].value"
|
||||
},
|
||||
{
|
||||
"name": "pid",
|
||||
"type": "JSON",
|
||||
"path": "$.pid[*]",
|
||||
"overrideMatch": "true"
|
||||
},
|
||||
{
|
||||
"name": "title",
|
||||
"type": "String",
|
||||
"path": "$.title[?(@.qualifier.classid == 'main title')].value",
|
||||
"length": 250,
|
||||
"size": 5
|
||||
},
|
||||
{
|
||||
"name": "authors",
|
||||
"type": "List",
|
||||
"path": "$.author[*].fullname[*]",
|
||||
"size": 200
|
||||
},
|
||||
{
|
||||
"name": "resulttype",
|
||||
"type": "String",
|
||||
"path": "$.resulttype.classid"
|
||||
}
|
||||
],
|
||||
"blacklists": {
|
||||
"title": [
|
||||
"^Inside Front Cover$",
|
||||
"(?i)^Poster presentations$",
|
||||
"^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
|
||||
"^Problems with perinatal pathology\\.?$",
|
||||
"(?i)^Cases? of Puerperal Convulsions$",
|
||||
"(?i)^Operative Gyna?ecology$",
|
||||
"(?i)^Mind the gap\\!?\\:?$",
|
||||
"^Chronic fatigue syndrome\\.?$",
|
||||
"^Cartas? ao editor Letters? to the Editor$",
|
||||
"^Note from the Editor$",
|
||||
"^Anesthesia Abstract$",
|
||||
"^Annual report$",
|
||||
"(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
|
||||
"(?i)^Graph and Table of Infectious Diseases?$",
|
||||
"^Presentation$",
|
||||
"(?i)^Reviews and Information on Publications$",
|
||||
"(?i)^PUBLIC HEALTH SERVICES?$",
|
||||
"(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
|
||||
"(?i)^Adrese autora$",
|
||||
"(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
|
||||
"(?i)^Acknowledgement to Referees$",
|
||||
"(?i)^Behçet's disease\\.?$",
|
||||
"(?i)^Isolation and identification of restriction endonuclease.*$",
|
||||
"(?i)^CEREBROVASCULAR DISEASES?.?$",
|
||||
"(?i)^Screening for abdominal aortic aneurysms?\\.?$",
|
||||
"^Event management$",
|
||||
"(?i)^Breakfast and Crohn's disease.*\\.?$",
|
||||
"^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
|
||||
"(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
|
||||
"^Gushi hakubutsugaku$",
|
||||
"^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
|
||||
"^Intestinal spirocha?etosis$",
|
||||
"^Treatment of Rodent Ulcer$",
|
||||
"(?i)^\\W*Cloud Computing\\W*$",
|
||||
"^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
|
||||
"^Free Communications, Poster Presentations: Session [A-F]$",
|
||||
"^“The Historical Aspects? of Quackery\\.?”$",
|
||||
"^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
|
||||
"^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
|
||||
"(?i)^Case Report$",
|
||||
"^Boletín Informativo$",
|
||||
"(?i)^Glioblastoma Multiforme$",
|
||||
"(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
|
||||
"^Zaměstnanecké výhody$",
|
||||
"(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
|
||||
"(?i)^Carotid body tumours?\\.?$",
|
||||
"(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
|
||||
"^Avant-propos$",
|
||||
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
|
||||
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
|
||||
"(?i)^PUBLIC HEALTH VERSUS THE STATE$",
|
||||
"^Viñetas de Cortázar$",
|
||||
"(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
|
||||
"(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
|
||||
"(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
|
||||
"(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
|
||||
"(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
|
||||
"^Aus der AGMB$",
|
||||
"^Znanstveno-stručni prilozi$",
|
||||
"(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
|
||||
"(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
|
||||
"(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
|
||||
"^Finanční analýza podniku$",
|
||||
"^Financial analysis( of business)?$",
|
||||
"(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
|
||||
"^Jikken nihon shūshinsho$",
|
||||
"(?i)^CORONER('|s)(s|') INQUESTS$",
|
||||
"(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
|
||||
"(?i)^Consultants' contract(s)?$",
|
||||
"(?i)^Upute autorima$",
|
||||
"(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
|
||||
"^Joshi shin kokubun$",
|
||||
"^Kōtō shōgaku dokuhon nōson'yō$",
|
||||
"^Jinjō shōgaku shōka$",
|
||||
"^Shōgaku shūjichō$",
|
||||
"^Nihon joshi dokuhon$",
|
||||
"^Joshi shin dokuhon$",
|
||||
"^Chūtō kanbun dokuhon$",
|
||||
"^Wabun dokuhon$",
|
||||
"(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
|
||||
"(?i)^cardiac rehabilitation$",
|
||||
"(?i)^Analytical summary$",
|
||||
"^Thesaurus resolutionum Sacrae Congregationis Concilii$",
|
||||
"(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
|
||||
"^Prikazi i osvrti$",
|
||||
"^Rodinný dům s provozovnou$",
|
||||
"^Family house with an establishment$",
|
||||
"^Shinsei chūtō shin kokugun$",
|
||||
"^Pulmonary alveolar proteinosis(\\.?)$",
|
||||
"^Shinshū kanbun$",
|
||||
"^Viñeta(s?) de Rodríguez$",
|
||||
"(?i)^RUBRIKA UREDNIKA$",
|
||||
"^A Matching Model of the Academic Publication Market$",
|
||||
"^Yōgaku kōyō$",
|
||||
"^Internetový marketing$",
|
||||
"^Internet marketing$",
|
||||
"^Chūtō kokugo dokuhon$",
|
||||
"^Kokugo dokuhon$",
|
||||
"^Antibiotic Cover for Dental Extraction(s?)$",
|
||||
"^Strategie podniku$",
|
||||
"^Strategy of an Enterprise$",
|
||||
"(?i)^respiratory disease(s?)(\\.?)$",
|
||||
"^Award(s?) for Gallantry in Civil Defence$",
|
||||
"^Podniková kultura$",
|
||||
"^Corporate Culture$",
|
||||
"^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
|
||||
"^Pracovní motivace$",
|
||||
"^Work Motivation$",
|
||||
"^Kaitei kōtō jogaku dokuhon$",
|
||||
"^Konsolidovaná účetní závěrka$",
|
||||
"^Consolidated Financial Statements$",
|
||||
"(?i)^intracranial tumour(s?)$",
|
||||
"^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
|
||||
"^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
|
||||
"^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
|
||||
"^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
|
||||
"^Úroveň motivačního procesu jako způsobu vedení lidí$",
|
||||
"^The level of motivation process as a leadership$",
|
||||
"^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
|
||||
"(?i)^news and events$",
|
||||
"(?i)^NOVOSTI I DOGAĐAJI$",
|
||||
"^Sansū no gakushū$",
|
||||
"^Posouzení informačního systému firmy a návrh změn$",
|
||||
"^Information System Assessment and Proposal for ICT Modification$",
|
||||
"^Stresové zatížení pracovníků ve vybrané profesi$",
|
||||
"^Stress load in a specific job$",
|
||||
"^Sunday: Poster Sessions, Pt.*$",
|
||||
"^Monday: Poster Sessions, Pt.*$",
|
||||
"^Wednesday: Poster Sessions, Pt.*",
|
||||
"^Tuesday: Poster Sessions, Pt.*$",
|
||||
"^Analýza reklamy$",
|
||||
"^Analysis of advertising$",
|
||||
"^Shōgaku shūshinsho$",
|
||||
"^Shōgaku sansū$",
|
||||
"^Shintei joshi kokubun$",
|
||||
"^Taishō joshi kokubun dokuhon$",
|
||||
"^Joshi kokubun$",
|
||||
"^Účetní uzávěrka a účetní závěrka v ČR$",
|
||||
"(?i)^The \"?Causes\"? of Cancer$",
|
||||
"^Normas para la publicación de artículos$",
|
||||
"^Editor('|s)(s|') [Rr]eply$",
|
||||
"^Editor(’|s)(s|’) letter$",
|
||||
"^Redaktoriaus žodis$",
|
||||
"^DISCUSSION ON THE PRECEDING PAPER$",
|
||||
"^Kōtō shōgaku shūshinsho jidōyō$",
|
||||
"^Shōgaku nihon rekishi$",
|
||||
"^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
|
||||
"^Préface$",
|
||||
"^Occupational [Hh]ealth [Ss]ervices.$",
|
||||
"^In Memoriam Professor Toshiyuki TAKESHIMA$",
|
||||
"^Účetní závěrka ve vybraném podniku.*$",
|
||||
"^Financial statements in selected company$",
|
||||
"^Abdominal [Aa]ortic [Aa]neurysms.*$",
|
||||
"^Pseudomyxoma peritonei$",
|
||||
"^Kazalo autora$",
|
||||
"(?i)^uvodna riječ$",
|
||||
"^Motivace jako způsob vedení lidí$",
|
||||
"^Motivation as a leadership$",
|
||||
"^Polyfunkční dům$",
|
||||
"^Multi\\-funkcional building$",
|
||||
"^Podnikatelský plán$",
|
||||
"(?i)^Podnikatelský záměr$",
|
||||
"(?i)^Business Plan$",
|
||||
"^Oceňování nemovitostí$",
|
||||
"^Marketingová komunikace$",
|
||||
"^Marketing communication$",
|
||||
"^Sumario Analítico$",
|
||||
"^Riječ uredništva$",
|
||||
"^Savjetovanja i priredbe$",
|
||||
"^Índice$",
|
||||
"^(Starobosanski nadpisi).*$",
|
||||
"^Vzdělávání pracovníků v organizaci$",
|
||||
"^Staff training in organization$",
|
||||
"^(Life Histories of North American Geometridae).*$",
|
||||
"^Strategická analýza podniku$",
|
||||
"^Strategic Analysis of an Enterprise$",
|
||||
"^Sadržaj$",
|
||||
"^Upute suradnicima$",
|
||||
"^Rodinný dům$",
|
||||
"(?i)^Fami(l)?ly house$",
|
||||
"^Upute autorima$",
|
||||
"^Strategic Analysis$",
|
||||
"^Finanční analýza vybraného podniku$",
|
||||
"^Finanční analýza$",
|
||||
"^Riječ urednika$",
|
||||
"(?i)^Content(s?)$",
|
||||
"(?i)^Inhalt$",
|
||||
"^Jinjō shōgaku shūshinsho jidōyō$",
|
||||
"(?i)^Index$",
|
||||
"^Chūgaku kokubun kyōkasho$",
|
||||
"^Retrato de una mujer$",
|
||||
"^Retrato de un hombre$",
|
||||
"^Kōtō shōgaku dokuhon$",
|
||||
"^Shotōka kokugo$",
|
||||
"^Shōgaku dokuhon$",
|
||||
"^Jinjō shōgaku kokugo dokuhon$",
|
||||
"^Shinsei kokugo dokuhon$",
|
||||
"^Teikoku dokuhon$",
|
||||
"^Instructions to Authors$",
|
||||
"^KİTAP TAHLİLİ$",
|
||||
"^PRZEGLĄD PIŚMIENNICTWA$",
|
||||
"(?i)^Presentación$",
|
||||
"^İçindekiler$",
|
||||
"(?i)^Tabl?e of contents$",
|
||||
"^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
|
||||
"^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
|
||||
"^Editorial( Board)?$",
|
||||
"(?i)^Editorial \\(English\\)$",
|
||||
"^Editörden$",
|
||||
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
|
||||
"^(Kiri Karl Morgensternile).*$",
|
||||
"^(\\[Eksliibris Aleksandr).*\\]$",
|
||||
"^(\\[Eksliibris Aleksandr).*$",
|
||||
"^(Eksliibris Aleksandr).*$",
|
||||
"^(Kiri A\\. de Vignolles).*$",
|
||||
"^(2 kirja Karl Morgensternile).*$",
|
||||
"^(Pirita kloostri idaosa arheoloogilised).*$",
|
||||
"^(Kiri tundmatule).*$",
|
||||
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
|
||||
"^(Eksliibris Nikolai Birukovile).*$",
|
||||
"^(Eksliibris Nikolai Issakovile).*$",
|
||||
"^(WHP Cruise Summary Information of section).*$",
|
||||
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
|
||||
"^(Measurement of the spin\\-dependent structure function).*",
|
||||
"(?i)^.*authors['’′]? reply\\.?$",
|
||||
"(?i)^.*authors['’′]? response\\.?$"
|
||||
]
|
||||
},
|
||||
"synonyms": {}
|
||||
}
|
||||
}
|
File diff suppressed because one or more lines are too long
|
@ -1,57 +0,0 @@
|
|||
package eu.dnetlib.pace.model.adaptor;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.google.common.base.Function;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.reflect.TypeToken;
|
||||
import com.google.gson.Gson;
|
||||
import com.google.gson.GsonBuilder;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
/**
|
||||
* Created by claudio on 01/03/16.
|
||||
*/
|
||||
public class Pid {
|
||||
|
||||
private static final Log log = LogFactory.getLog(Pid.class);
|
||||
|
||||
private String value;
|
||||
|
||||
private String type;
|
||||
|
||||
public static List<Pid> fromOafJson(final List<String> json) {
|
||||
|
||||
log.debug(String.format("\nPid: %s", json));
|
||||
|
||||
final GsonBuilder gb = new GsonBuilder();
|
||||
gb.registerTypeAdapter(Pid.class, new PidOafSerialiser());
|
||||
final Gson gson = gb.create();
|
||||
|
||||
return Lists.newArrayList(Iterables.transform(json, new Function<String, Pid>() {
|
||||
@Override
|
||||
public Pid apply(final String s) {
|
||||
return gson.fromJson(s, Pid.class);
|
||||
}
|
||||
}));
|
||||
}
|
||||
|
||||
public String getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public void setType(final String type) {
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
public String getValue() {
|
||||
return value;
|
||||
}
|
||||
|
||||
public void setValue(final String value) {
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,50 +0,0 @@
|
|||
package eu.dnetlib.pace.model.adaptor;
|
||||
|
||||
import java.lang.reflect.Type;
|
||||
import java.util.List;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.gson.*;
|
||||
import eu.dnetlib.pace.model.gt.GTAuthor;
|
||||
|
||||
/**
|
||||
* Created by claudio on 01/03/16.
|
||||
*/
|
||||
public class PidOafSerialiser implements JsonDeserializer<Pid> {
|
||||
|
||||
private static final String VALUE = "value";
|
||||
|
||||
private static final String QUALIFIER = "qualifier";
|
||||
private static final String CLASSID = "classid";
|
||||
|
||||
@Override
|
||||
public Pid deserialize(final JsonElement json, final Type typeOfT, final JsonDeserializationContext context) throws JsonParseException {
|
||||
|
||||
final Pid pid = new Pid();
|
||||
|
||||
pid.setType(getType(json));
|
||||
pid.setValue(getValue(json));
|
||||
|
||||
return pid;
|
||||
}
|
||||
|
||||
private String getValue(final JsonElement json) {
|
||||
final JsonObject obj =json.getAsJsonObject();
|
||||
return obj.get(VALUE).getAsString();
|
||||
|
||||
}
|
||||
|
||||
private String getType(final JsonElement json) {
|
||||
|
||||
final JsonObject obj =json.getAsJsonObject();
|
||||
|
||||
if (!obj.has(QUALIFIER))
|
||||
throw new IllegalArgumentException("pid does not contain any type: " + json.toString());
|
||||
|
||||
final JsonObject qualifier = obj.getAsJsonObject(QUALIFIER);
|
||||
|
||||
final JsonElement classid = qualifier.get(CLASSID);
|
||||
|
||||
return classid.getAsString();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,70 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldList;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@ComparatorClass("jsonListMatch")
|
||||
public class JsonListMatch extends AbstractComparator {
|
||||
|
||||
private static final Log log = LogFactory.getLog(JsonListMatch.class);
|
||||
private Map<String, String> params;
|
||||
|
||||
public JsonListMatch(final Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(final Field a, final Field b, final Config conf) {
|
||||
|
||||
final List<String> sa = ((FieldList) a).stringList();
|
||||
final List<String> sb = ((FieldList) b).stringList();
|
||||
|
||||
if (sa.isEmpty() || sb.isEmpty()) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
final Set<String> ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet());
|
||||
final Set<String> cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet());
|
||||
|
||||
int incommon = Sets.intersection(ca, cb).size();
|
||||
int simDiff = Sets.symmetricDifference(ca, cb).size();
|
||||
|
||||
if (incommon + simDiff == 0) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
return (double)incommon / (incommon + simDiff) > Double.parseDouble(params.getOrDefault("threshold", "0.5")) ? 1 : 0;
|
||||
|
||||
}
|
||||
|
||||
//converts every json into a comparable string basing on parameters
|
||||
private String toComparableString(String json){
|
||||
|
||||
StringBuilder st = new StringBuilder(); //to build the string used for comparisons basing on the jpath into parameters
|
||||
|
||||
//for each path in the param list
|
||||
for (String key: params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) {
|
||||
String path = params.get(key);
|
||||
String value = MapDocumentUtil.getJPathString(path, json);
|
||||
if (value == null || value.isEmpty())
|
||||
value = "";
|
||||
st.append( value + "::");
|
||||
}
|
||||
|
||||
st.setLength(st.length()-2);
|
||||
return st.toString();
|
||||
}
|
||||
}
|
|
@ -1,64 +0,0 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldList;
|
||||
import eu.dnetlib.pace.model.adaptor.Pid;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@ComparatorClass("pidMatch")
|
||||
public class PidMatch extends AbstractComparator {
|
||||
|
||||
private static final Log log = LogFactory.getLog(PidMatch.class);
|
||||
private Map<String, String> params;
|
||||
|
||||
public PidMatch(final Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(final Field a, final Field b, final Config conf) {
|
||||
|
||||
final List<String> sa = ((FieldList) a).stringList();
|
||||
final List<String> sb = ((FieldList) b).stringList();
|
||||
|
||||
final List<Pid> pal = Pid.fromOafJson(sa);
|
||||
final List<Pid> pbl = Pid.fromOafJson(sb);
|
||||
|
||||
if (pal.isEmpty() || pbl.isEmpty()) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
final Set<String> pidAset = toHashSet(pal);
|
||||
final Set<String> pidBset = toHashSet(pbl);
|
||||
|
||||
int incommon = Sets.intersection(pidAset, pidBset).size();
|
||||
int simDiff = Sets.symmetricDifference(pidAset, pidBset).size();
|
||||
|
||||
if (incommon + simDiff == 0) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
return (double)incommon / (incommon + simDiff) > Double.parseDouble(params.getOrDefault("threshold", "0.5")) ? 1 : 0;
|
||||
|
||||
}
|
||||
|
||||
//lowercase + normalization of the pid before adding it to the set
|
||||
private Set<String> toHashSet(List<Pid> pbl) {
|
||||
|
||||
return pbl.stream()
|
||||
.map(pid -> pid.getType() + normalizePid(pid.getValue()))
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
}
|
||||
}
|
|
@ -0,0 +1,47 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldList;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
@ComparatorClass("stringListMatch")
|
||||
public class StringListMatch extends AbstractComparator {
|
||||
|
||||
private static final Log log = LogFactory.getLog(StringListMatch.class);
|
||||
private Map<String, String> params;
|
||||
|
||||
public StringListMatch(final Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(final Field a, final Field b, final Config conf) {
|
||||
|
||||
final Set<String> pa = new HashSet<>(((FieldList) a).stringList());
|
||||
final Set<String> pb = new HashSet<>(((FieldList) b).stringList());
|
||||
|
||||
if (pa.isEmpty() || pb.isEmpty()) {
|
||||
return -1; //return undefined if one of the two lists of pids is empty
|
||||
}
|
||||
|
||||
int incommon = Sets.intersection(pa, pb).size();
|
||||
int simDiff = Sets.symmetricDifference(pa, pb).size();
|
||||
|
||||
if (incommon + simDiff == 0) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
return (double)incommon / (incommon + simDiff) > Double.parseDouble(params.getOrDefault("threshold", "0.5")) ? 1 : 0;
|
||||
|
||||
}
|
||||
}
|
|
@ -2,7 +2,9 @@ package eu.dnetlib.pace.util;
|
|||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.jayway.jsonpath.Configuration;
|
||||
import com.jayway.jsonpath.JsonPath;
|
||||
import com.jayway.jsonpath.Option;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
|
@ -55,7 +57,7 @@ public class MapDocumentUtil {
|
|||
|
||||
public static List<String> getJPathList(String path, String json, Type type) {
|
||||
if (type == Type.List)
|
||||
return JsonPath.read(json, path);
|
||||
return JsonPath.using(Configuration.defaultConfiguration().addOptions(Option.ALWAYS_RETURN_LIST, Option.SUPPRESS_EXCEPTIONS)).parse(json).read(path);
|
||||
Object jresult;
|
||||
List<String> result = new ArrayList<>();
|
||||
try {
|
||||
|
|
|
@ -18,7 +18,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
@Before
|
||||
public void setUp() throws Exception {
|
||||
params = Maps.newHashMap();
|
||||
conf = DedupConfig.load(AbstractPaceFunctions.readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf", ClusteringFunctionTest.class));
|
||||
conf = DedupConfig.load(AbstractPaceFunctions.readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ClusteringFunctionTest.class));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -110,15 +110,6 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPersonClustering2() {
|
||||
final ClusteringFunction cf = new PersonClustering(params);
|
||||
|
||||
final String s = readFromClasspath("gt.author.json");
|
||||
System.out.println(s);
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(person(s))));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testKeywordsClustering() {
|
||||
|
||||
|
|
|
@ -4,7 +4,6 @@ import eu.dnetlib.pace.clustering.NGramUtils;
|
|||
import eu.dnetlib.pace.tree.*;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import org.junit.Before;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
|
@ -24,7 +23,7 @@ public class ComparatorTest extends AbstractPaceFunctions {
|
|||
public void setup() {
|
||||
params = new HashMap<>();
|
||||
params.put("weight", "1.0");
|
||||
conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf", ComparatorTest.class));
|
||||
conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ComparatorTest.class));
|
||||
|
||||
}
|
||||
|
||||
|
@ -115,5 +114,9 @@ public class ComparatorTest extends AbstractPaceFunctions {
|
|||
System.out.println("result = " + result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void jsonListMatchTest() {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -2,11 +2,15 @@ package eu.dnetlib.pace.config;
|
|||
|
||||
|
||||
import eu.dnetlib.pace.AbstractPaceTest;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldList;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.tree.JsonListMatch;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
@ -16,7 +20,7 @@ public class ConfigTest extends AbstractPaceTest {
|
|||
|
||||
@Test
|
||||
public void dedupConfigSerializationTest() {
|
||||
final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("organization.current.conf"));
|
||||
final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("organization.current.conf.json"));
|
||||
|
||||
final String conf = cfgFromClasspath.toString();
|
||||
|
||||
|
@ -26,13 +30,12 @@ public class ConfigTest extends AbstractPaceTest {
|
|||
|
||||
assertNotNull(cfgFromClasspath);
|
||||
assertNotNull(cfgFromSerialization);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void dedupConfigTest() {
|
||||
|
||||
DedupConfig load = DedupConfig.load(readFromClasspath("organization.current.conf"));
|
||||
DedupConfig load = DedupConfig.load(readFromClasspath("organization.current.conf.json"));
|
||||
|
||||
System.out.println(load.toString());
|
||||
}
|
||||
|
@ -40,7 +43,7 @@ public class ConfigTest extends AbstractPaceTest {
|
|||
@Test
|
||||
public void initTranslationMapTest() {
|
||||
|
||||
DedupConfig load = DedupConfig.load(readFromClasspath("organization.current.conf"));
|
||||
DedupConfig load = DedupConfig.load(readFromClasspath("organization.current.conf.json"));
|
||||
|
||||
Map<String, String> translationMap = load.translationMap();
|
||||
|
||||
|
@ -50,38 +53,26 @@ public class ConfigTest extends AbstractPaceTest {
|
|||
if (translationMap.get(key).equals("key::1"))
|
||||
System.out.println("key = " + key);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void emptyTranslationMapTest() {
|
||||
|
||||
DedupConfig load = DedupConfig.load(readFromClasspath("organization.no_synonyms.conf"));
|
||||
DedupConfig load = DedupConfig.load(readFromClasspath("organization.no_synonyms.conf.json"));
|
||||
|
||||
assertEquals(0, load.getPace().translationMap().keySet().size());
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
public void testAsMapDocumentJPath() throws Exception {
|
||||
|
||||
DedupConfig load = DedupConfig.load(readFromClasspath("result.pace.conf_jpath.json"));
|
||||
|
||||
|
||||
System.out.println(load.getWf().getIdPath());
|
||||
|
||||
final String result =IOUtils.toString(this.getClass().getResourceAsStream("result.json"));
|
||||
|
||||
System.out.println(result);
|
||||
final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(load, result);
|
||||
|
||||
System.out.println(mapDocument.getFieldMap());
|
||||
|
||||
}
|
||||
public void asMapDocumentTest() throws Exception {
|
||||
|
||||
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json"));
|
||||
|
||||
final String json = readFromClasspath("publication.json");
|
||||
|
||||
final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json);
|
||||
|
||||
System.out.println("mapDocument = " + mapDocument.getFieldMap());
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,126 +0,0 @@
|
|||
package eu.dnetlib.pace.model;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
public class PersonComparatorUtilsNGramsTest {
|
||||
|
||||
@Test
|
||||
public void testNormaizePerson_1() {
|
||||
verifyGetNgramsForPerson("Artini Michele", 2, "a_michele", "m_artini");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNormaizePerson_2() {
|
||||
verifyGetNgramsForPerson("Michele Artini", 2, "a_michele", "m_artini");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNormaizePerson_3() {
|
||||
verifyGetNgramsForPerson("Michele ARTINI", 1, "m_artini");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNormaizePerson_4() {
|
||||
verifyGetNgramsForPerson("ARTINI Michele", 1, "m_artini");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNormaizePerson_5() {
|
||||
verifyGetNgramsForPerson("Michele G. Artini", 2, "m_artini", "g_artini");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNormaizePerson_6() {
|
||||
verifyGetNgramsForPerson(" Artini, Michele ", 1, "m_artini");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNormaizePerson_7() {
|
||||
verifyGetNgramsForPerson("Artini, Michele (sig.)", 1, "m_artini");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNormaizePerson_8() {
|
||||
verifyGetNgramsForPerson("Artini Michele [sig.] ", 2, "a_michele", "m_artini");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNormaizePerson_9() {
|
||||
verifyGetNgramsForPerson("Artini, M", 1, "m_artini");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNormaizePerson_10() {
|
||||
verifyGetNgramsForPerson("Artini, M.", 1, "m_artini");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNormaizePerson_11() {
|
||||
verifyGetNgramsForPerson("Artini, M. (sig.)", 1, "m_artini");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNormaizePerson_12() {
|
||||
verifyGetNgramsForPerson("Artini, M[sig.] ", 1, "m_artini");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNormaizePerson_13() {
|
||||
verifyGetNgramsForPerson("Artini-SIG, Michele ", 1, "m_artini-sig");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNormaizePerson_14() {
|
||||
verifyGetNgramsForPerson("Artini - SIG, Michele ", 1, "m_artini-sig");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNormaizePerson_15() {
|
||||
verifyGetNgramsForPerson("Artini {sig.}, M", 1, "m_artini");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNormaizePerson_16() {
|
||||
verifyGetNgramsForPerson("Artini, M., sig.", 1, "m_artini");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNormaizePerson_17() {
|
||||
verifyGetNgramsForPerson("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA, BBBBBBBBBBBBBBBBBBBBBBBBBBBBB CCCCCCCCCCCCCCCCCCCC", 0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNormaizePerson_18() {
|
||||
verifyGetNgramsForPerson("Dell'amico, Andrea", 1, "a_amico");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNormaizePerson_19() {
|
||||
verifyGetNgramsForPerson("Smith, Paul van der", 1, "p_smith");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNormaizePerson_20() {
|
||||
verifyGetNgramsForPerson("AAAAAAA, BBBB, CCCC, DDDD, EEEE", 1, "b_aaaaaaa");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNormaizePerson_21() {
|
||||
verifyGetNgramsForPerson("Kompetenzzentrum Informelle Bildung (KIB),", 6);
|
||||
}
|
||||
|
||||
private void verifyGetNgramsForPerson(String name, int expectedSize, String... expectedTokens) {
|
||||
Set<String> list = PersonComparatorUtils.getNgramsForPerson(name);
|
||||
System.out.println(list);
|
||||
assertEquals(expectedSize, list.size());
|
||||
for (String s : expectedTokens) {
|
||||
assertTrue(list.contains(s));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -1,89 +0,0 @@
|
|||
package eu.dnetlib.pace.model;
|
||||
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
public class PersonComparatorUtilsSimilarityTest {
|
||||
|
||||
@Test
|
||||
public void testSimilarity_0() {
|
||||
assertTrue(PersonComparatorUtils.areSimilar("Artini Michele", "Michele Artini"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimilarity_1() {
|
||||
assertTrue(PersonComparatorUtils.areSimilar("ARTINI Michele", "Artini, Michele"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimilarity_2() {
|
||||
assertTrue(PersonComparatorUtils.areSimilar("Artini, M.", "Artini Michele"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimilarity_3() {
|
||||
assertTrue(PersonComparatorUtils.areSimilar("Artini, M.G.", "Artini, Michele"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimilarity_4() {
|
||||
assertTrue(PersonComparatorUtils.areSimilar("Artini, M.", "Artini, M.G."));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimilarity_5() {
|
||||
assertTrue(PersonComparatorUtils.areSimilar("Artini, M. (sig.)", "Artini, Michele"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimilarity_6() {
|
||||
assertFalse(PersonComparatorUtils.areSimilar("Artini, M.", "Artini, G."));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimilarity_7() {
|
||||
assertFalse(PersonComparatorUtils.areSimilar("Artini, M.G.", "Artini, M.A."));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimilarity_8() {
|
||||
assertFalse(PersonComparatorUtils.areSimilar("Artini, M.", "Artini, Giuseppe"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimilarity_9() {
|
||||
assertFalse(PersonComparatorUtils.areSimilar("Manghi, Paolo", "Artini, Michele"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimilarity_10() {
|
||||
assertTrue(PersonComparatorUtils.areSimilar("Artini, Michele", "Artini, Michele Giovanni"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimilarity_11() {
|
||||
assertFalse(PersonComparatorUtils.areSimilar("Artini, M.A.G.", "Artini, M.B.G."));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimilarity_12() {
|
||||
assertFalse(PersonComparatorUtils.areSimilar("Artini Manghi, M.", "Artini, Michele"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimilarity_13() {
|
||||
assertTrue(PersonComparatorUtils.areSimilar("Artini Manghi, M.", "Artini Manghi Michele"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimilarity_14() {
|
||||
assertFalse(PersonComparatorUtils.areSimilar("Artini, Michele", "Michele, Artini"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimilarity_15() {
|
||||
assertTrue(PersonComparatorUtils.areSimilar("Artini, M.", "Michele ARTINI"));
|
||||
}
|
||||
}
|
|
@ -1,111 +0,0 @@
|
|||
package eu.dnetlib.pace.model;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
import java.text.Normalizer;
|
||||
import java.util.Queue;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
public class PersonTest {
|
||||
|
||||
@Test
|
||||
public void test_1() {
|
||||
check("Atzori, Claudio", "Atzori, Claudio");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test_2() {
|
||||
check("Atzori, Claudio A.", "Atzori, Claudio A.");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test_3() {
|
||||
check("Claudio ATZORI", "Atzori, Claudio");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test_4() {
|
||||
check("ATZORI, Claudio", "Atzori, Claudio");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test_5() {
|
||||
check("Claudio Atzori", "Claudio Atzori");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test_6() {
|
||||
check(" Manghi , Paolo", "Manghi, Paolo");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test_7() {
|
||||
check("ATZORI, CLAUDIO", "Atzori, Claudio");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test_8() {
|
||||
check("ATZORI, CLAUDIO A", "Atzori, Claudio A.");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test_9() {
|
||||
check("Bølviken, B.", "Bølviken, B.");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test_10() {
|
||||
check("Bñlviken, B.", "B" + Normalizer.normalize("ñ", Normalizer.Form.NFD) + "lviken, B.");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test_11() {
|
||||
check("aáeéiíoóöőuúüű AÁEÉIÍOÓÖŐUÚÜŰ ø", "Aaeeiioooouuuu, Aaeeiioooouuuu Ø.", true);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test_12() {
|
||||
check("aáeéiíoóöőuúüű AÁEÉIÍOÓÖŐUÚÜŰz ø", Normalizer.normalize("aáeéiíoóöőuúüű AÁEÉIÍOÓÖŐUÚÜŰz ø", Normalizer.Form.NFD), false);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test_13() {
|
||||
check("Tkačíková, Daniela", Normalizer.normalize("Tkačíková, Daniela", Normalizer.Form.NFD), false);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test_hashes() {
|
||||
checkHash(" Claudio ATZORI ", "ATZORI Claudio", "Atzori , Claudio", "ATZORI, Claudio");
|
||||
}
|
||||
|
||||
private void checkHash(String... ss) {
|
||||
Queue<String> q = Lists.newLinkedList(Lists.newArrayList(ss));
|
||||
String h1 = new Person(q.remove(), false).hash();
|
||||
while (!q.isEmpty()) {
|
||||
assertEquals(h1, new Person(q.remove(), false).hash());
|
||||
}
|
||||
}
|
||||
|
||||
private void check(String s, String expectedFullName) {
|
||||
check(s, expectedFullName, false);
|
||||
}
|
||||
|
||||
private void check(String s, String expectedFullName, boolean aggressive) {
|
||||
Person p = new Person(s, aggressive);
|
||||
|
||||
System.out.println("original: " + p.getOriginal());
|
||||
System.out.println("accurate: " + p.isAccurate());
|
||||
System.out.println("normalised: '" + p.getNormalisedFullname() + "'");
|
||||
if (p.isAccurate()) {
|
||||
System.out.println("name: " + p.getNormalisedFirstName());
|
||||
System.out.println("surname: " + p.getNormalisedSurname());
|
||||
}
|
||||
System.out.println("hash: " + p.hash());
|
||||
System.out.println("");
|
||||
assertEquals(expectedFullName, p.getNormalisedFullname());
|
||||
}
|
||||
|
||||
}
|
File diff suppressed because one or more lines are too long
|
@ -18,16 +18,81 @@
|
|||
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
|
||||
],
|
||||
"decisionTree" : {
|
||||
"start": {"fields": [{"field":"gridid", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"true", "params": {}}], "threshold":1.0, "aggregation": "MAX", "positive":"MATCH", "negative":"layer2", "undefined":"layer2", "ignoreUndefined": "true"},
|
||||
"layer2": {"fields": [{"field":"websiteurl", "comparator":"domainExactMatch", "weight":1.0, "countIfUndefined":"true", "params" : {}}, {"field":"country", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"false", "params": {}}], "threshold":1.0, "aggregation": "MIN", "positive":"layer3", "negative":"NO_MATCH", "undefined":"layer3", "ignoreUndefined": "false"},
|
||||
"layer3": {"fields": [{"field":"legalname", "comparator":"jaroWinklerNormalizedName", "weight":0.9, "countIfUndefined":"false", "params":{"windowSize" : 4, "threshold" : 0.7}}, {"field":"legalshortname", "comparator":"jaroWinklerNormalizedName", "weight":0.1, "countIfUndefined":"true", "params":{}}], "threshold": 0.9, "aggregation": "W_MEAN", "positive":"MATCH", "negative":"NO_MATCH", "undefined":"NO_MATCH", "ignoreUndefined": "true"}
|
||||
"start": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "gridid",
|
||||
"comparator": "exactMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "true",
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"threshold": 1.0,
|
||||
"aggregation": "MAX",
|
||||
"positive": "MATCH",
|
||||
"negative": "layer2",
|
||||
"undefined": "layer2",
|
||||
"ignoreUndefined": "true"
|
||||
},
|
||||
"layer2": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "websiteurl",
|
||||
"comparator": "domainExactMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "true",
|
||||
"params": {}
|
||||
},
|
||||
{
|
||||
"field": "country",
|
||||
"comparator": "exactMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "false",
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"threshold": 1.0,
|
||||
"aggregation": "MIN",
|
||||
"positive": "layer3",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "layer3",
|
||||
"ignoreUndefined": "false"
|
||||
},
|
||||
"layer3": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "legalname",
|
||||
"comparator": "jaroWinklerNormalizedName",
|
||||
"weight": 0.9,
|
||||
"countIfUndefined": "false",
|
||||
"params": {
|
||||
"windowSize": 4,
|
||||
"threshold": 0.7
|
||||
}
|
||||
},
|
||||
{
|
||||
"field": "legalshortname",
|
||||
"comparator": "jaroWinklerNormalizedName",
|
||||
"weight": 0.1,
|
||||
"countIfUndefined": "true",
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"threshold": 0.9,
|
||||
"aggregation": "W_MEAN",
|
||||
"positive": "MATCH",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "NO_MATCH",
|
||||
"ignoreUndefined": "true"
|
||||
}
|
||||
},
|
||||
"model" : [
|
||||
{ "name" : "country", "type" : "String", "path" : "organization/metadata/country/classid"},
|
||||
{ "name" : "legalshortname", "type" : "String", "path" : "organization/metadata/legalshortname/value"},
|
||||
{ "name" : "legalname", "type" : "String", "path" : "organization/metadata/legalname/value" },
|
||||
{ "name" : "websiteurl", "type" : "URL", "path" : "organization/metadata/websiteurl/value" },
|
||||
{ "name" : "gridid", "type" : "String", "path" : "pid[qualifier#classid = {grid}]/value"}
|
||||
{ "name" : "country", "type" : "String", "path" : "$.organization.metadata.country.classid"},
|
||||
{ "name" : "legalshortname", "type" : "String", "path" : "$.organization.metadata.legalshortname.value"},
|
||||
{ "name" : "legalname", "type" : "String", "path" : "$organization.metadata.legalname.value" },
|
||||
{ "name" : "websiteurl", "type" : "URL", "path" : "$.organization.metadata.websiteurl.value" },
|
||||
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid ==\"grid\")].value"}
|
||||
],
|
||||
"blacklists" : {
|
||||
"legalname" : []
|
|
@ -23,11 +23,11 @@
|
|||
"layer3": {"fields": [{"field":"legalname", "comparator":"jaroWinklerNormalizedName", "weight":0.9, "countIfUndefined":"false", "params":{"windowSize" : 4, "threshold" : 0.7}}, {"field":"legalshortname", "comparator":"jaroWinklerNormalizedName", "weight":0.1, "countIfUndefined":"true", "params":{}}], "threshold": 0.9, "aggregation": "W_MEAN", "positive":"MATCH", "negative":"NO_MATCH", "undefined":"NO_MATCH", "ignoreUndefined": "true"}
|
||||
},
|
||||
"model" : [
|
||||
{ "name" : "country", "type" : "String", "path" : "organization/metadata/country/classid"},
|
||||
{ "name" : "legalshortname", "type" : "String", "path" : "organization/metadata/legalshortname/value"},
|
||||
{ "name" : "legalname", "type" : "String", "path" : "organization/metadata/legalname/value" },
|
||||
{ "name" : "websiteurl", "type" : "URL", "path" : "organization/metadata/websiteurl/value" },
|
||||
{ "name" : "gridid", "type" : "String", "path" : "pid[qualifier#classid = {grid}]/value"}
|
||||
{ "name" : "country", "type" : "String", "path" : "$.organization.metadata.country.classid"},
|
||||
{ "name" : "legalshortname", "type" : "String", "path" : "$.organization.metadata.legalshortname.value"},
|
||||
{ "name" : "legalname", "type" : "String", "path" : "$organization.metadata.legalname.value" },
|
||||
{ "name" : "websiteurl", "type" : "URL", "path" : "$.organization.metadata.websiteurl.value" },
|
||||
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid ==\"grid\")].value"}
|
||||
],
|
||||
"blacklists" : {
|
||||
"legalname" : []
|
|
@ -0,0 +1,387 @@
|
|||
{
|
||||
"wf": {
|
||||
"threshold": "0.99",
|
||||
"dedupRun": "001",
|
||||
"entityType": "result",
|
||||
"subEntityType": "resulttype",
|
||||
"subEntityValue": "publication",
|
||||
"orderField": "title",
|
||||
"queueMaxSize": "2000",
|
||||
"groupMaxSize": "100",
|
||||
"maxChildren": "100",
|
||||
"slidingWindowSize": "200",
|
||||
"rootBuilder": [
|
||||
"result",
|
||||
"resultProject_outcome_isProducedBy",
|
||||
"resultResult_publicationDataset_isRelatedTo",
|
||||
"resultResult_similarity_isAmongTopNSimilarDocuments",
|
||||
"resultResult_similarity_hasAmongTopNSimilarDocuments",
|
||||
"resultOrganization_affiliation_isAffiliatedWith",
|
||||
"resultResult_part_hasPart",
|
||||
"resultResult_part_isPartOf",
|
||||
"resultResult_supplement_isSupplementTo",
|
||||
"resultResult_supplement_isSupplementedBy",
|
||||
"resultResult_version_isVersionOf"
|
||||
],
|
||||
"includeChildren": "true",
|
||||
"maxIterations": 20,
|
||||
"idPath": "$.entity.id"
|
||||
},
|
||||
"pace": {
|
||||
"clustering": [
|
||||
{
|
||||
"name": "ngrampairs",
|
||||
"fields": [
|
||||
"title"
|
||||
],
|
||||
"params": {
|
||||
"max": "1",
|
||||
"ngramLen": "3"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "suffixprefix",
|
||||
"fields": [
|
||||
"title"
|
||||
],
|
||||
"params": {
|
||||
"max": "1",
|
||||
"len": "3"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "lowercase",
|
||||
"fields": [
|
||||
"doi"
|
||||
],
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"decisionTree": {
|
||||
"start": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "pid",
|
||||
"comparator": "jsonListMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "false",
|
||||
"params": {
|
||||
"threshold": "0.5",
|
||||
"jpath_value": "$.value",
|
||||
"jpath_classid": "$.qualifier.classid"
|
||||
}
|
||||
}
|
||||
],
|
||||
"threshold": 1.0,
|
||||
"aggregation": "MAX",
|
||||
"positive": "MATCH",
|
||||
"negative": "layer2",
|
||||
"undefined": "layer2",
|
||||
"ignoreUndefined": "true"
|
||||
},
|
||||
"layer2": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "title",
|
||||
"comparator": "titleVersionMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "false",
|
||||
"params": {}
|
||||
},
|
||||
{
|
||||
"field": "authors",
|
||||
"comparator": "sizeMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "false",
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"threshold": 1.0,
|
||||
"aggregation": "NC",
|
||||
"positive": "layer3",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "layer3",
|
||||
"ignoreUndefined": "false"
|
||||
},
|
||||
"layer3": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "title",
|
||||
"comparator": "LevensteinTitle",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "true",
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"threshold": 0.99,
|
||||
"aggregation": "SUM",
|
||||
"positive": "MATCH",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "NO_MATCH",
|
||||
"ignoreUndefined": "true"
|
||||
}
|
||||
},
|
||||
"model": [
|
||||
{
|
||||
"name": "doi",
|
||||
"type": "String",
|
||||
"path": "$.pid[?(@.qualifier.classid == 'doi')].value"
|
||||
},
|
||||
{
|
||||
"name": "pid",
|
||||
"type": "JSON",
|
||||
"path": "$.pid[*]",
|
||||
"overrideMatch": "true"
|
||||
},
|
||||
{
|
||||
"name": "title",
|
||||
"type": "String",
|
||||
"path": "$.title[?(@.qualifier.classid == 'main title')].value",
|
||||
"length": 250,
|
||||
"size": 5
|
||||
},
|
||||
{
|
||||
"name": "authors",
|
||||
"type": "List",
|
||||
"path": "$.author[*].fullname",
|
||||
"size": 200
|
||||
},
|
||||
{
|
||||
"name": "resulttype",
|
||||
"type": "String",
|
||||
"path": "$.resulttype.classid"
|
||||
}
|
||||
],
|
||||
"blacklists": {
|
||||
"title": [
|
||||
"^Inside Front Cover$",
|
||||
"(?i)^Poster presentations$",
|
||||
"^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
|
||||
"^Problems with perinatal pathology\\.?$",
|
||||
"(?i)^Cases? of Puerperal Convulsions$",
|
||||
"(?i)^Operative Gyna?ecology$",
|
||||
"(?i)^Mind the gap\\!?\\:?$",
|
||||
"^Chronic fatigue syndrome\\.?$",
|
||||
"^Cartas? ao editor Letters? to the Editor$",
|
||||
"^Note from the Editor$",
|
||||
"^Anesthesia Abstract$",
|
||||
"^Annual report$",
|
||||
"(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
|
||||
"(?i)^Graph and Table of Infectious Diseases?$",
|
||||
"^Presentation$",
|
||||
"(?i)^Reviews and Information on Publications$",
|
||||
"(?i)^PUBLIC HEALTH SERVICES?$",
|
||||
"(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
|
||||
"(?i)^Adrese autora$",
|
||||
"(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
|
||||
"(?i)^Acknowledgement to Referees$",
|
||||
"(?i)^Behçet's disease\\.?$",
|
||||
"(?i)^Isolation and identification of restriction endonuclease.*$",
|
||||
"(?i)^CEREBROVASCULAR DISEASES?.?$",
|
||||
"(?i)^Screening for abdominal aortic aneurysms?\\.?$",
|
||||
"^Event management$",
|
||||
"(?i)^Breakfast and Crohn's disease.*\\.?$",
|
||||
"^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
|
||||
"(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
|
||||
"^Gushi hakubutsugaku$",
|
||||
"^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
|
||||
"^Intestinal spirocha?etosis$",
|
||||
"^Treatment of Rodent Ulcer$",
|
||||
"(?i)^\\W*Cloud Computing\\W*$",
|
||||
"^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
|
||||
"^Free Communications, Poster Presentations: Session [A-F]$",
|
||||
"^“The Historical Aspects? of Quackery\\.?”$",
|
||||
"^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
|
||||
"^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
|
||||
"(?i)^Case Report$",
|
||||
"^Boletín Informativo$",
|
||||
"(?i)^Glioblastoma Multiforme$",
|
||||
"(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
|
||||
"^Zaměstnanecké výhody$",
|
||||
"(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
|
||||
"(?i)^Carotid body tumours?\\.?$",
|
||||
"(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
|
||||
"^Avant-propos$",
|
||||
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
|
||||
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
|
||||
"(?i)^PUBLIC HEALTH VERSUS THE STATE$",
|
||||
"^Viñetas de Cortázar$",
|
||||
"(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
|
||||
"(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
|
||||
"(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
|
||||
"(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
|
||||
"(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
|
||||
"^Aus der AGMB$",
|
||||
"^Znanstveno-stručni prilozi$",
|
||||
"(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
|
||||
"(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
|
||||
"(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
|
||||
"^Finanční analýza podniku$",
|
||||
"^Financial analysis( of business)?$",
|
||||
"(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
|
||||
"^Jikken nihon shūshinsho$",
|
||||
"(?i)^CORONER('|s)(s|') INQUESTS$",
|
||||
"(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
|
||||
"(?i)^Consultants' contract(s)?$",
|
||||
"(?i)^Upute autorima$",
|
||||
"(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
|
||||
"^Joshi shin kokubun$",
|
||||
"^Kōtō shōgaku dokuhon nōson'yō$",
|
||||
"^Jinjō shōgaku shōka$",
|
||||
"^Shōgaku shūjichō$",
|
||||
"^Nihon joshi dokuhon$",
|
||||
"^Joshi shin dokuhon$",
|
||||
"^Chūtō kanbun dokuhon$",
|
||||
"^Wabun dokuhon$",
|
||||
"(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
|
||||
"(?i)^cardiac rehabilitation$",
|
||||
"(?i)^Analytical summary$",
|
||||
"^Thesaurus resolutionum Sacrae Congregationis Concilii$",
|
||||
"(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
|
||||
"^Prikazi i osvrti$",
|
||||
"^Rodinný dům s provozovnou$",
|
||||
"^Family house with an establishment$",
|
||||
"^Shinsei chūtō shin kokugun$",
|
||||
"^Pulmonary alveolar proteinosis(\\.?)$",
|
||||
"^Shinshū kanbun$",
|
||||
"^Viñeta(s?) de Rodríguez$",
|
||||
"(?i)^RUBRIKA UREDNIKA$",
|
||||
"^A Matching Model of the Academic Publication Market$",
|
||||
"^Yōgaku kōyō$",
|
||||
"^Internetový marketing$",
|
||||
"^Internet marketing$",
|
||||
"^Chūtō kokugo dokuhon$",
|
||||
"^Kokugo dokuhon$",
|
||||
"^Antibiotic Cover for Dental Extraction(s?)$",
|
||||
"^Strategie podniku$",
|
||||
"^Strategy of an Enterprise$",
|
||||
"(?i)^respiratory disease(s?)(\\.?)$",
|
||||
"^Award(s?) for Gallantry in Civil Defence$",
|
||||
"^Podniková kultura$",
|
||||
"^Corporate Culture$",
|
||||
"^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
|
||||
"^Pracovní motivace$",
|
||||
"^Work Motivation$",
|
||||
"^Kaitei kōtō jogaku dokuhon$",
|
||||
"^Konsolidovaná účetní závěrka$",
|
||||
"^Consolidated Financial Statements$",
|
||||
"(?i)^intracranial tumour(s?)$",
|
||||
"^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
|
||||
"^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
|
||||
"^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
|
||||
"^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
|
||||
"^Úroveň motivačního procesu jako způsobu vedení lidí$",
|
||||
"^The level of motivation process as a leadership$",
|
||||
"^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
|
||||
"(?i)^news and events$",
|
||||
"(?i)^NOVOSTI I DOGAĐAJI$",
|
||||
"^Sansū no gakushū$",
|
||||
"^Posouzení informačního systému firmy a návrh změn$",
|
||||
"^Information System Assessment and Proposal for ICT Modification$",
|
||||
"^Stresové zatížení pracovníků ve vybrané profesi$",
|
||||
"^Stress load in a specific job$",
|
||||
"^Sunday: Poster Sessions, Pt.*$",
|
||||
"^Monday: Poster Sessions, Pt.*$",
|
||||
"^Wednesday: Poster Sessions, Pt.*",
|
||||
"^Tuesday: Poster Sessions, Pt.*$",
|
||||
"^Analýza reklamy$",
|
||||
"^Analysis of advertising$",
|
||||
"^Shōgaku shūshinsho$",
|
||||
"^Shōgaku sansū$",
|
||||
"^Shintei joshi kokubun$",
|
||||
"^Taishō joshi kokubun dokuhon$",
|
||||
"^Joshi kokubun$",
|
||||
"^Účetní uzávěrka a účetní závěrka v ČR$",
|
||||
"(?i)^The \"?Causes\"? of Cancer$",
|
||||
"^Normas para la publicación de artículos$",
|
||||
"^Editor('|s)(s|') [Rr]eply$",
|
||||
"^Editor(’|s)(s|’) letter$",
|
||||
"^Redaktoriaus žodis$",
|
||||
"^DISCUSSION ON THE PRECEDING PAPER$",
|
||||
"^Kōtō shōgaku shūshinsho jidōyō$",
|
||||
"^Shōgaku nihon rekishi$",
|
||||
"^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
|
||||
"^Préface$",
|
||||
"^Occupational [Hh]ealth [Ss]ervices.$",
|
||||
"^In Memoriam Professor Toshiyuki TAKESHIMA$",
|
||||
"^Účetní závěrka ve vybraném podniku.*$",
|
||||
"^Financial statements in selected company$",
|
||||
"^Abdominal [Aa]ortic [Aa]neurysms.*$",
|
||||
"^Pseudomyxoma peritonei$",
|
||||
"^Kazalo autora$",
|
||||
"(?i)^uvodna riječ$",
|
||||
"^Motivace jako způsob vedení lidí$",
|
||||
"^Motivation as a leadership$",
|
||||
"^Polyfunkční dům$",
|
||||
"^Multi\\-funkcional building$",
|
||||
"^Podnikatelský plán$",
|
||||
"(?i)^Podnikatelský záměr$",
|
||||
"(?i)^Business Plan$",
|
||||
"^Oceňování nemovitostí$",
|
||||
"^Marketingová komunikace$",
|
||||
"^Marketing communication$",
|
||||
"^Sumario Analítico$",
|
||||
"^Riječ uredništva$",
|
||||
"^Savjetovanja i priredbe$",
|
||||
"^Índice$",
|
||||
"^(Starobosanski nadpisi).*$",
|
||||
"^Vzdělávání pracovníků v organizaci$",
|
||||
"^Staff training in organization$",
|
||||
"^(Life Histories of North American Geometridae).*$",
|
||||
"^Strategická analýza podniku$",
|
||||
"^Strategic Analysis of an Enterprise$",
|
||||
"^Sadržaj$",
|
||||
"^Upute suradnicima$",
|
||||
"^Rodinný dům$",
|
||||
"(?i)^Fami(l)?ly house$",
|
||||
"^Upute autorima$",
|
||||
"^Strategic Analysis$",
|
||||
"^Finanční analýza vybraného podniku$",
|
||||
"^Finanční analýza$",
|
||||
"^Riječ urednika$",
|
||||
"(?i)^Content(s?)$",
|
||||
"(?i)^Inhalt$",
|
||||
"^Jinjō shōgaku shūshinsho jidōyō$",
|
||||
"(?i)^Index$",
|
||||
"^Chūgaku kokubun kyōkasho$",
|
||||
"^Retrato de una mujer$",
|
||||
"^Retrato de un hombre$",
|
||||
"^Kōtō shōgaku dokuhon$",
|
||||
"^Shotōka kokugo$",
|
||||
"^Shōgaku dokuhon$",
|
||||
"^Jinjō shōgaku kokugo dokuhon$",
|
||||
"^Shinsei kokugo dokuhon$",
|
||||
"^Teikoku dokuhon$",
|
||||
"^Instructions to Authors$",
|
||||
"^KİTAP TAHLİLİ$",
|
||||
"^PRZEGLĄD PIŚMIENNICTWA$",
|
||||
"(?i)^Presentación$",
|
||||
"^İçindekiler$",
|
||||
"(?i)^Tabl?e of contents$",
|
||||
"^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
|
||||
"^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
|
||||
"^Editorial( Board)?$",
|
||||
"(?i)^Editorial \\(English\\)$",
|
||||
"^Editörden$",
|
||||
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
|
||||
"^(Kiri Karl Morgensternile).*$",
|
||||
"^(\\[Eksliibris Aleksandr).*\\]$",
|
||||
"^(\\[Eksliibris Aleksandr).*$",
|
||||
"^(Eksliibris Aleksandr).*$",
|
||||
"^(Kiri A\\. de Vignolles).*$",
|
||||
"^(2 kirja Karl Morgensternile).*$",
|
||||
"^(Pirita kloostri idaosa arheoloogilised).*$",
|
||||
"^(Kiri tundmatule).*$",
|
||||
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
|
||||
"^(Eksliibris Nikolai Birukovile).*$",
|
||||
"^(Eksliibris Nikolai Issakovile).*$",
|
||||
"^(WHP Cruise Summary Information of section).*$",
|
||||
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
|
||||
"^(Measurement of the spin\\-dependent structure function).*",
|
||||
"(?i)^.*authors['’′]? reply\\.?$",
|
||||
"(?i)^.*authors['’′]? response\\.?$"
|
||||
]
|
||||
},
|
||||
"synonyms": {}
|
||||
}
|
||||
}
|
|
@ -1,48 +0,0 @@
|
|||
{
|
||||
"wf" : {
|
||||
"threshold" : "0.99",
|
||||
"dedupRun" : "001",
|
||||
"entityType" : "result",
|
||||
"orderField" : "title",
|
||||
"queueMaxSize" : "2000",
|
||||
"groupMaxSize" : "10",
|
||||
"slidingWindowSize" : "200",
|
||||
"idPath": "$.entity.id",
|
||||
"rootBuilder" : [ "result" ],
|
||||
"includeChildren" : "true"
|
||||
},
|
||||
"pace" : {
|
||||
"clustering" : [
|
||||
{ "name" : "acronyms", "fields" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} },
|
||||
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
|
||||
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }
|
||||
],
|
||||
"decisionTree": {},
|
||||
"model" : [
|
||||
{ "name" : "pid", "type" : "JSON", "path" : "$.entity.pid"},
|
||||
{ "name" : "dateofacceptance", "type" : "String", "path" : "$.entity.result.metadata.dateofacceptance.value"},
|
||||
{ "name" : "title", "type" : "String","path" : "$.entity.result.metadata.title[?(@.qualifier.classid ==\"main title\")].value" },
|
||||
{ "name" : "authors", "type" : "List", "path" : "$.entity.result.metadata.author[*].fullname" }
|
||||
],
|
||||
"blacklists" : {
|
||||
"title" : [
|
||||
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
|
||||
"^(Kiri Karl Morgensternile).*$",
|
||||
"^(\\[Eksliibris Aleksandr).*\\]$",
|
||||
"^(\\[Eksliibris Aleksandr).*$",
|
||||
"^(Eksliibris Aleksandr).*$",
|
||||
"^(Kiri A\\. de Vignolles).*$",
|
||||
"^(2 kirja Karl Morgensternile).*$",
|
||||
"^(Pirita kloostri idaosa arheoloogilised).*$",
|
||||
"^(Kiri tundmatule).*$",
|
||||
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
|
||||
"^(Eksliibris Nikolai Birukovile).*$",
|
||||
"^(Eksliibris Nikolai Issakovile).*$",
|
||||
"^(WHP Cruise Summary Information of section).*$",
|
||||
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
|
||||
"^(Measurement of the spin\\-dependent structure function).*"
|
||||
] } ,
|
||||
"synonyms": {}
|
||||
}
|
||||
|
||||
}
|
|
@ -1,51 +0,0 @@
|
|||
{
|
||||
"wf" : {
|
||||
"threshold" : "0.99",
|
||||
"dedupRun" : "001",
|
||||
"entityType" : "result",
|
||||
"orderField" : "title",
|
||||
"queueMaxSize" : "2000",
|
||||
"groupMaxSize" : "10",
|
||||
"slidingWindowSize" : "200",
|
||||
"rootBuilder" : [ "result" ],
|
||||
"includeChildren" : "true"
|
||||
},
|
||||
"pace" : {
|
||||
"clustering" : [
|
||||
{ "name" : "acronyms", "fields" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} },
|
||||
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
|
||||
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }
|
||||
],
|
||||
"decisionTree": {
|
||||
"start": {"fields": [{"field":"pid", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"true", "params": {}}], "threshold":1.0, "aggregation": "MAX", "positive":"MATCH", "negative":"layer2", "undefined":"layer2", "ignoreUndefined": "true"},
|
||||
"layer2": {"fields": [{"field":"dateofacceptance", "comparator":"yearMatch", "weight":1.0, "countIfUndefined":"true", "params" : {}}, {"field":"title", "comparator":"titleVersionMatch", "weight":1.0, "countIfUndefined":"false", "params": {}}, {"field":"authors", "comparator":"sizeMatch", "weight":1.0, "countIfUndefined":"false", "params": {}}], "threshold":1.0, "aggregation": "MIN", "positive":"layer3", "negative":"NO_MATCH", "undefined":"layer3", "ignoreUndefined": "false"},
|
||||
"layer3": {"fields": [{"field":"title", "comparator":"JaroWinkler", "weight":1.0, "countIfUndefined":"false", "params":{}}], "threshold": 0.99, "aggregation": "MAX", "positive":"MATCH", "negative":"NO_MATCH", "undefined":"NO_MATCH", "ignoreUndefined": "false"}
|
||||
},
|
||||
"model" : [
|
||||
{ "name" : "pid", "type" : "String", "path" : "pid[qualifier#classid = {doi}]/value", "overrideMatch" : "true" },
|
||||
{ "name" : "title", "type" : "String", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
|
||||
{ "name" : "dateofacceptance", "type" : "String", "path" : "result/metadata/dateofacceptance/value" } ,
|
||||
{ "name" : "authors", "type" : "List", "path" : "result/author/metadata/fullname/value" }
|
||||
],
|
||||
"blacklists" : {
|
||||
"title" : [
|
||||
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
|
||||
"^(Kiri Karl Morgensternile).*$",
|
||||
"^(\\[Eksliibris Aleksandr).*\\]$",
|
||||
"^(\\[Eksliibris Aleksandr).*$",
|
||||
"^(Eksliibris Aleksandr).*$",
|
||||
"^(Kiri A\\. de Vignolles).*$",
|
||||
"^(2 kirja Karl Morgensternile).*$",
|
||||
"^(Pirita kloostri idaosa arheoloogilised).*$",
|
||||
"^(Kiri tundmatule).*$",
|
||||
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
|
||||
"^(Eksliibris Nikolai Birukovile).*$",
|
||||
"^(Eksliibris Nikolai Issakovile).*$",
|
||||
"^(WHP Cruise Summary Information of section).*$",
|
||||
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
|
||||
"^(Measurement of the spin\\-dependent structure function).*"
|
||||
] },
|
||||
"synonyms": {}
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue