From fbb1b66bfba175d67693415885cd88f13ead909a Mon Sep 17 00:00:00 2001
From: miconis
Date: Mon, 13 Sep 2021 14:53:19 +0200
Subject: [PATCH] dedup test implementation & graph drawing tools

---
 .../pace/util/BlockProcessorForTesting.java   | 304 ++++++++++
 .../clustering/ClusteringFunctionTest.java    |  17 +-
 .../pace/comparators/ComparatorTest.java      |  38 +-
 .../eu/dnetlib/pace/config/ConfigTest.java    |  18 +
 .../eu/dnetlib/pace/config/pub.prod.conf.json | 402 ++++++++++++++++++
 .../pace/config/publication.example.json      |   1 +
 release.properties                            |  11 -
 7 files changed, 777 insertions(+), 14 deletions(-)
 create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java
 create mode 100644 dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/pub.prod.conf.json
 create mode 100644 dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/publication.example.json
 delete mode 100644 release.properties

diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java
new file mode 100644
index 000000000..9bf05f37b
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java
@@ -0,0 +1,304 @@
+package eu.dnetlib.pace.util;
+
+import com.google.common.collect.Lists;
+import eu.dnetlib.pace.clustering.NGramUtils;
+import eu.dnetlib.pace.config.DedupConfig;
+import eu.dnetlib.pace.config.WfConfig;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.MapDocument;
+import eu.dnetlib.pace.model.MapDocumentComparator;
+import eu.dnetlib.pace.tree.JsonListMatch;
+import eu.dnetlib.pace.tree.LevensteinTitle;
+import eu.dnetlib.pace.tree.SizeMatch;
+import eu.dnetlib.pace.tree.TitleVersionMatch;
+import eu.dnetlib.pace.tree.support.FieldStats;
+import eu.dnetlib.pace.tree.support.TreeProcessor;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import java.util.*;
+
+/**
+ * Block processor variant added for dedup tests: takes the documents grouped under one key,
+ * compares them pairwise and emits every matching pair, deciding either with the configured
+ * {@link TreeProcessor} or with the hard-coded publication scoring of this class.
+ */
+public class BlockProcessorForTesting {
+
+    /** Counter names, formatted as {@code entityType::counterName}; filled by {@link #constructAccumulator(DedupConfig)}. */
+    public static final List<String> accumulators= new ArrayList<>();
+
+    private static final Log log = LogFactory.getLog(eu.dnetlib.pace.util.BlockProcessorForTesting.class);
+
+    /** Dedup configuration; its workflow section supplies the order field, size limits and counter group. */
+    private DedupConfig dedupConf;
+
+    /**
+     * Appends to {@link #accumulators} the names of the six counters incremented by this class.
+     * The list is static and is not cleared here, so every call adds six more entries.
+     *
+     * @param dedupConf configuration whose entity type, order field, group max size and threshold
+     *                  are embedded in the counter names
+     */
+    public static void constructAccumulator( final DedupConfig dedupConf) {
+        accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "records per hash key = 1"));
+        accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()));
+        accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), String.format("Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize())));
+        accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "skip list"));
+        accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"));
+        accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()));
+    }
+
+    /**
+     * @param dedupConf configuration used for every block processed by this instance
+     */
+    public BlockProcessorForTesting(DedupConfig dedupConf) {
+        this.dedupConf = dedupConf;
+    }
+
+    /**
+     * Processes a block without the grouping step applied by
+     * {@link #process(String, Iterable, Reporter, boolean)}.
+     *
+     * @param key       key of the block (only referenced by the commented-out log line)
+     * @param documents documents of the block; a block with fewer than two documents is only counted
+     * @param context   receives counters and emitted pairs
+     * @param useTree   {@code true} to compare with the decision tree, {@code false} for the hard-coded scoring
+     */
+    public void processSortedBlock(final String key, final List<MapDocument> documents, final Reporter context, boolean useTree) {
+        if (documents.size() > 1) {
+//            log.info("reducing key: '" + key + "' records: " + q.size());
+            process(prepare(documents), context, useTree);
+
+        } else {
+            context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
+        }
+    }
+
+    /**
+     * Processes a block: de-duplicates and orders the documents, drops groups of documents with the
+     * same cleaned order-field value that reach the group max size, then compares what is left.
+     *
+     * @param key       key of the block
+     * @param documents documents of the block
+     * @param context   receives counters and emitted pairs
+     * @param useTree   {@code true} to compare with the decision tree, {@code false} for the hard-coded scoring
+     */
+    public void process(final String key, final Iterable<MapDocument> documents, final Reporter context, boolean useTree) {
+
+        final Queue<MapDocument> q = prepare(documents);
+
+        if (q.size() > 1) {
+//            log.info("reducing key: '" + key + "' records: " + q.size());
+            process(simplifyQueue(q, key, context), context, useTree);
+
+        } else {
+            context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
+        }
+    }
+
+    /**
+     * Collects the documents into a priority queue ordered by {@link MapDocumentComparator} on the
+     * order field, keeping the first occurrence of each identifier.
+     * Documents are ignored once the queue holds more than {@code queueMaxSize} elements.
+     */
+    private Queue<MapDocument> prepare(final Iterable<MapDocument> documents) {
+        final Queue<MapDocument> queue = new PriorityQueue<>(100, new MapDocumentComparator(dedupConf.getWf().getOrderField()));
+
+        final Set<String> seen = new HashSet<String>();
+        final int queueMaxSize = dedupConf.getWf().getQueueMaxSize();
+
+        documents.forEach(doc -> {
+            if (queue.size() <= queueMaxSize) {
+                final String id = doc.getIdentifier();
+
+                if (!seen.contains(id)) {
+                    seen.add(id);
+                    queue.add(doc);
+                }
+            }
+        });
+
+        return queue;
+    }
+
+    /**
+     * Drains the queue and groups consecutive documents having the same cleaned order-field value;
+     * each group is passed to {@link #populateSimplifiedQueue}, which keeps or discards it as a whole.
+     * Documents with an empty order field are counted as missing and left out.
+     */
+    private Queue<MapDocument> simplifyQueue(final Queue<MapDocument> queue, final String ngram, final Reporter context) {
+        final Queue<MapDocument> q = new LinkedList<>();
+
+        String fieldRef = "";
+        final List<MapDocument> tempResults = Lists.newArrayList();
+
+        while (!queue.isEmpty()) {
+            final MapDocument result = queue.remove();
+
+            final String orderFieldName = dedupConf.getWf().getOrderField();
+            final Field orderFieldValue = result.values(orderFieldName);
+            if (!orderFieldValue.isEmpty()) {
+                final String field = NGramUtils.cleanupForOrdering(orderFieldValue.stringValue());
+                if (field.equals(fieldRef)) {
+                    tempResults.add(result);
+                } else {
+                    populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram);
+                    tempResults.clear();
+                    tempResults.add(result);
+                    fieldRef = field;
+                }
+            } else {
+                context.incrementCounter(dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField(), 1);
+            }
+        }
+        populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram);
+
+        return q;
+    }
+
+    /**
+     * Adds the group to {@code q} when it has fewer than {@code groupMaxSize} documents; otherwise
+     * only adds the group size to the "Skipped records" counter.
+     */
+    private void populateSimplifiedQueue(final Queue<MapDocument> q,
+                                         final List<MapDocument> tempResults,
+                                         final Reporter context,
+                                         final String fieldRef,
+                                         final String ngram) {
+        WfConfig wf = dedupConf.getWf();
+        if (tempResults.size() < wf.getGroupMaxSize()) {
+            q.addAll(tempResults);
+        } else {
+            context.incrementCounter(wf.getEntityType(), String.format("Skipped records for count(%s) >= %s", wf.getOrderField(), wf.getGroupMaxSize()), tempResults.size());
+//            log.info("Skipped field: " + fieldRef + " - size: " + tempResults.size() + " - ngram: " + ngram);
+        }
+    }
+
+    /**
+     * Repeatedly removes the head of the queue (the pivot) and compares it with the documents still
+     * queued, skipping those with the pivot's id or an empty order field. For each pivot the scan
+     * stops at the first document whose namespace prefix is in the skip list.
+     */
+    private void process(final Queue<MapDocument> queue, final Reporter context, boolean useTree) {
+
+        while (!queue.isEmpty()) {
+
+            final MapDocument pivot = queue.remove();
+            final String idPivot = pivot.getIdentifier();
+
+            WfConfig wf = dedupConf.getWf();
+            final Field fieldsPivot = pivot.values(wf.getOrderField());
+            final String fieldPivot = (fieldsPivot == null) || fieldsPivot.isEmpty() ? "" : fieldsPivot.stringValue();
+
+            // fieldPivot is "" when the field is missing, so this only fails if stringValue() returns null
+            if (fieldPivot != null) {
+                // NOTE(review): i is never incremented, so the sliding-window check below never
+                // breaks and every queued document is compared with the pivot - confirm whether
+                // the window is meant to be enforced before changing it.
+                int i = 0;
+                for (final MapDocument curr : queue) {
+                    final String idCurr = curr.getIdentifier();
+
+                    if (mustSkip(idCurr)) {
+
+                        context.incrementCounter(wf.getEntityType(), "skip list", 1);
+
+                        break;
+                    }
+
+                    if (i > wf.getSlidingWindowSize()) {
+                        break;
+                    }
+
+                    final Field fieldsCurr = curr.values(wf.getOrderField());
+                    final String fieldCurr = (fieldsCurr == null) || fieldsCurr.isEmpty() ? null : fieldsCurr.stringValue();
+
+                    if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
+
+//                        if (new TreeProcessor(dedupConf).compare(pivot, curr) == true && publicationCompare(pivot, curr, dedupConf) == false)
+//                            emitOutput(true, idPivot, idCurr, context);
+//
+                        if(useTree)
+                            emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
+                        else
+                            emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context);
+
+                    }
+                }
+            }
+        }
+    }
+
+    /**
+     * Hard-coded alternative to the decision tree. A pid json-list match of at least 0.5 adds 10 to
+     * the score, a minimum of exactly 0 between the title-version and authors-size results
+     * subtracts 2, and the {@link LevensteinTitle} result on the titles (0 when NaN) is added on top.
+     *
+     * @return {@code true} when the final score is at least 0.99
+     */
+    private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {
+
+        double score = 0.0;
+        //LAYER 1 - comparison of the PIDs json lists
+        Map<String, String> params = new HashMap<>();
+        params.put("jpath_value", "$.value");
+        params.put("jpath_classid", "$.qualifier.classid");
+        JsonListMatch jsonListMatch = new JsonListMatch(params);
+        double result = jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config);
+        if (result >= 0.5) //if the result of the comparison is greater than the threshold
+            score += 10.0; //high score because it should match when the first condition is satisfied
+        else
+            score += 0.0;
+
+        //LAYER 2 - comparison of the title version and the size of the authors lists
+        TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
+        double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
+        SizeMatch sizeMatch = new SizeMatch(params);
+        double result2 = sizeMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
+        if (Math.min(result1, result2) != 0)
+            score+=0;
+        else
+            score-=2;
+
+        //LAYER 3 - computation of levenshtein on titles
+        LevensteinTitle levensteinTitle = new LevensteinTitle(params);
+        double result3 = levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
+        score += Double.isNaN(result3)?0.0:result3;
+
+        return score >= 0.99;
+    }
+
+    /**
+     * Writes the pair and counts it as a similarity when {@code result} is true; otherwise
+     * increments the counter named after the configured threshold.
+     */
+    private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
+
+        if (result) {
+            writeSimilarity(context, idPivot, idCurr);
+            context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1);
+        } else {
+            context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1);
+        }
+    }
+
+    /** Tells whether the namespace prefix of the identifier is contained in the configured skip list. */
+    private boolean mustSkip(final String idPivot) {
+        return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot));
+    }
+
+    /** Returns what {@code StringUtils.substringBetween} extracts from the id between "|" and "::". */
+    private String getNsPrefix(final String id) {
+        return StringUtils.substringBetween(id, "|", "::");
+    }
+
+    /** Emits the pair in both directions under the configured entity type. */
+    private void writeSimilarity(final Reporter context, final String from, final String to) {
+        final String type = dedupConf.getWf().getEntityType();
+
+        context.emit(type, from, to);
+        context.emit(type, to, from);
+    }
+}
diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
index 91a327474..8657b5804 100644
--- 
a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java @@ -47,7 +47,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { @Test public void testNgramPairs() { params.put("ngramLen", 3); - params.put("max", 1); + params.put("max", 2); final ClusteringFunction np = new NgramPairs(params); @@ -59,7 +59,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { @Test public void testSortedNgramPairs() { params.put("ngramLen", 3); - params.put("max", 1); + params.put("max", 2); final ClusteringFunction np = new SortedNgramPairs(params); @@ -70,6 +70,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s2 = "Pisa University"; System.out.println(s2); System.out.println(np.apply(conf, Lists.newArrayList(title(s2)))); + + final String s3 = "Parco Tecnologico Agroalimentare Umbria"; + System.out.println(s3); + System.out.println(np.apply(conf, Lists.newArrayList(title(s3)))); + } @Test @@ -132,6 +137,14 @@ public class ClusteringFunctionTest extends AbstractPaceTest { System.out.println(s); System.out.println(sp.apply(conf, Lists.newArrayList(title(s)))); + s = "JRC Open Power Plants Database (JRC-PPDB-OPEN)"; + System.out.println(s); + System.out.println(sp.apply(conf, Lists.newArrayList(title(s)))); + + s = "JRC Open Power Plants Database"; + System.out.println(s); + System.out.println(sp.apply(conf, Lists.newArrayList(title(s)))); + } @Test diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java index 060526bfb..6bdd1ad45 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java @@ -14,7 +14,7 @@ import eu.dnetlib.pace.common.AbstractPaceFunctions; import java.util.HashMap; 
import java.util.Map; - +@TestInstance(TestInstance.Lifecycle.PER_CLASS) public class ComparatorTest extends AbstractPaceFunctions { private Map params; @@ -119,6 +119,42 @@ public class ComparatorTest extends AbstractPaceFunctions { } + @Test + public void jaroWinklerTest() { + + final JaroWinkler jaroWinkler = new JaroWinkler(params); + + double result = jaroWinkler.distance("Sofia", "Sofìa", conf); + System.out.println("result = " + result); + + result = jaroWinkler.distance("University of Victoria Dataverse", "University of Windsor Dataverse", conf); + System.out.println("result = " + result); + + result = jaroWinkler.distance("Victoria Dataverse", "Windsor Dataverse", conf); + System.out.println("result = " + result); + + final Levenstein levenstein = new Levenstein(params); + + result = levenstein.distance("Victoria", "Windsor", conf); + System.out.println("result = " + result); + + //University of Victoria Dataverse + //University of British Columbia Dataverse + //University of Windsor Dataverse + //University of Waterloo Dataverse + //University of Toronto Dataverse + //University of Ottawa Dataverse + } + + @Test + public void levensteinTitleTest() { + + final LevensteinTitle levensteinTitle = new LevensteinTitle(params); + double result = levensteinTitle.distance("JRC: Open Power Plants Database", "JRC Open Power Plants Database (JRC-PPDB-OPEN)", conf); + + System.out.println("result = " + result); + } + @Test public void jsonListMatchTest(){ diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java index dc7b11a8a..dbf7f08a9 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java @@ -2,6 +2,7 @@ package eu.dnetlib.pace.config; import eu.dnetlib.pace.AbstractPaceTest; +import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner; import 
eu.dnetlib.pace.clustering.ClusteringClass; import eu.dnetlib.pace.clustering.ClusteringCombiner; import eu.dnetlib.pace.model.Field; @@ -128,6 +129,23 @@ public class ConfigTest extends AbstractPaceTest { assertEquals("doi", combine[2].split(":")[1]); } + @Test + public void filterAndCombineTest() { + + DedupConfig dedupConf = DedupConfig.load(readFromClasspath("pub.prod.conf.json")); + + final String json = readFromClasspath("publication.example.json"); + + final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json); + + Collection strings = BlacklistAwareClusteringCombiner.filterAndCombine(mapDocument, dedupConf); + + for (String s: strings) { + System.out.println("s = " + s); + } + + } + @Test public void crossCompareTest() { diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/pub.prod.conf.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/pub.prod.conf.json new file mode 100644 index 000000000..ab34ed877 --- /dev/null +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/pub.prod.conf.json @@ -0,0 +1,402 @@ + +{ + "wf": { + "threshold": "0.99", + "dedupRun": "001", + "entityType": "result", + "subEntityType": "resulttype", + "subEntityValue": "publication", + "orderField": "title", + "queueMaxSize": "200", + "groupMaxSize": "100", + "maxChildren": "100", + "slidingWindowSize": "50", + "rootBuilder": [ + "result", + "resultProject_outcome_isProducedBy", + "resultResult_publicationDataset_isRelatedTo", + "resultResult_similarity_isAmongTopNSimilarDocuments", + "resultResult_similarity_hasAmongTopNSimilarDocuments", + "resultOrganization_affiliation_isAffiliatedWith", + "resultResult_part_hasPart", + "resultResult_part_isPartOf", + "resultResult_supplement_isSupplementTo", + "resultResult_supplement_isSupplementedBy", + "resultResult_version_isVersionOf" + ], + "includeChildren": "true", + "maxIterations": 20, + "idPath": "$.id" + }, + "pace": { + "clustering" : [ + { "name" : 
"wordsStatsSuffixPrefixChain", "fields" : [ "title" ], "params" : { "mod" : "10" } }, + { "name" : "lowercase", "fields" : [ "doi", "altdoi" ], "params" : { "collapseOn:pid": "0"} } + ], + "decisionTree": { + "start": { + "fields": [ + { + "field": "pid", + "comparator": "jsonListMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": { + "jpath_value": "$.value", + "jpath_classid": "$.qualifier.classid" + } + }, + { + "field": "pid", + "comparator": "jsonListMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": { + "jpath_value": "$.value", + "jpath_classid": "$.qualifier.classid", + "crossCompare": "alternateid" + } + } + ], + "threshold": 0.5, + "aggregation": "MAX", + "positive": "layer1", + "negative": "layer2", + "undefined": "layer2", + "ignoreUndefined": "true" + }, + "layer1": { + "fields": [ + { + "field": "title", + "comparator": "levensteinTitle", + "weight": 1.0, + "countIfUndefined": "true", + "params": {} + } + ], + "threshold": 0.9, + "aggregation": "AVG", + "positive": "MATCH", + "negative": "NO_MATCH", + "undefined": "NO_MATCH", + "ignoreUndefined": "true" + }, + "layer2": { + "fields": [ + { + "field": "title", + "comparator": "titleVersionMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": {} + }, + { + "field": "authors", + "comparator": "sizeMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": {} + } + ], + "threshold": 1.0, + "aggregation": "AND", + "positive": "layer3", + "negative": "NO_MATCH", + "undefined": "layer3", + "ignoreUndefined": "false" + }, + "layer3": { + "fields": [ + { + "field": "title", + "comparator": "levensteinTitle", + "weight": 1.0, + "countIfUndefined": "true", + "params": {} + } + ], + "threshold": 0.99, + "aggregation": "AVG", + "positive": "MATCH", + "negative": "NO_MATCH", + "undefined": "NO_MATCH", + "ignoreUndefined": "true" + } + }, + "model": [ + { + "name": "doi", + "type": "String", + "path": "$.instance.pid[?(@.qualifier.classid == 'doi')].value" + }, 
+ { + "name": "altdoi", + "type": "String", + "path": "$.instance.alternateIdentifier[?(@.qualifier.classid == 'doi')].value" + }, + { + "name": "pid", + "type": "JSON", + "path": "$.instance.pid", + "overrideMatch": "true" + }, + { + "name": "alternateid", + "type": "JSON", + "path": "$.instance.alternateIdentifier", + "overrideMatch": "true" + }, + { + "name": "title", + "type": "String", + "path": "$.title[?(@.qualifier.classid == 'main title')].value", + "length": 250, + "size": 5 + }, + { + "name": "authors", + "type": "List", + "path": "$.author[*].fullname", + "size": 200 + }, + { + "name": "resulttype", + "type": "String", + "path": "$.resulttype.classid" + } + ], + "blacklists": { + "title": [ + "(?i)^Data Management Plan", + "^Inside Front Cover$", + "(?i)^Poster presentations$", + "^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$", + "^Problems with perinatal pathology\\.?$", + "(?i)^Cases? of Puerperal Convulsions$", + "(?i)^Operative Gyna?ecology$", + "(?i)^Mind the gap\\!?\\:?$", + "^Chronic fatigue syndrome\\.?$", + "^Cartas? ao editor Letters? to the Editor$", + "^Note from the Editor$", + "^Anesthesia Abstract$", + "^Annual report$", + "(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$", + "(?i)^Graph and Table of Infectious Diseases?$", + "^Presentation$", + "(?i)^Reviews and Information on Publications$", + "(?i)^PUBLIC HEALTH SERVICES?$", + "(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$", + "(?i)^Adrese autora$", + "(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$", + "(?i)^Acknowledgement to Referees$", + "(?i)^Behçet's disease\\.?$", + "(?i)^Isolation and identification of restriction endonuclease.*$", + "(?i)^CEREBROVASCULAR DISEASES?.?$", + "(?i)^Screening for abdominal aortic aneurysms?\\.?$", + "^Event management$", + "(?i)^Breakfast and Crohn's disease.*\\.?$", + "^Cálculo de concentraciones en disoluciones acuosas. 
Ejercicio interactivo\\..*\\.$", + "(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$", + "^Gushi hakubutsugaku$", + "^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$", + "^Intestinal spirocha?etosis$", + "^Treatment of Rodent Ulcer$", + "(?i)^\\W*Cloud Computing\\W*$", + "^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$", + "^Free Communications, Poster Presentations: Session [A-F]$", + "^“The Historical Aspects? of Quackery\\.?”$", + "^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$", + "^P(er|re)-Mile Premiums for Auto Insurance\\.?$", + "(?i)^Case Report$", + "^Boletín Informativo$", + "(?i)^Glioblastoma Multiforme$", + "(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$", + "^Zaměstnanecké výhody$", + "(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$", + "(?i)^Carotid body tumours?\\.?$", + "(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$", + "^Avant-propos$", + "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$", + "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$", + "(?i)^PUBLIC HEALTH VERSUS THE STATE$", + "^Viñetas de Cortázar$", + "(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? 
bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$", + "(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$", + "(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$", + "(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$", + "(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$", + "^Aus der AGMB$", + "^Znanstveno-stručni prilozi$", + "(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$", + "(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$", + "(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$", + "^Finanční analýza podniku$", + "^Financial analysis( of business)?$", + "(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$", + "^Jikken nihon shūshinsho$", + "(?i)^CORONER('|s)(s|') INQUESTS$", + "(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$", + "(?i)^Consultants' contract(s)?$", + "(?i)^Upute autorima$", + "(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$", + "^Joshi shin kokubun$", + "^Kōtō shōgaku dokuhon nōson'yō$", + "^Jinjō shōgaku shōka$", + "^Shōgaku shūjichō$", + "^Nihon joshi dokuhon$", + "^Joshi shin dokuhon$", + "^Chūtō kanbun dokuhon$", + "^Wabun dokuhon$", + "(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$", + "(?i)^cardiac rehabilitation$", + "(?i)^Analytical summary$", + "^Thesaurus resolutionum Sacrae Congregationis Concilii$", + "(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$", + "^Prikazi i osvrti$", + "^Rodinný dům s provozovnou$", + "^Family house with an establishment$", + "^Shinsei chūtō shin kokugun$", + "^Pulmonary alveolar proteinosis(\\.?)$", 
+ "^Shinshū kanbun$", + "^Viñeta(s?) de Rodríguez$", + "(?i)^RUBRIKA UREDNIKA$", + "^A Matching Model of the Academic Publication Market$", + "^Yōgaku kōyō$", + "^Internetový marketing$", + "^Internet marketing$", + "^Chūtō kokugo dokuhon$", + "^Kokugo dokuhon$", + "^Antibiotic Cover for Dental Extraction(s?)$", + "^Strategie podniku$", + "^Strategy of an Enterprise$", + "(?i)^respiratory disease(s?)(\\.?)$", + "^Award(s?) for Gallantry in Civil Defence$", + "^Podniková kultura$", + "^Corporate Culture$", + "^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$", + "^Pracovní motivace$", + "^Work Motivation$", + "^Kaitei kōtō jogaku dokuhon$", + "^Konsolidovaná účetní závěrka$", + "^Consolidated Financial Statements$", + "(?i)^intracranial tumour(s?)$", + "^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$", + "^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$", + "^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$", + "^\\[Funciones auxiliares de la música en Radio París,.*\\]$", + "^Úroveň motivačního procesu jako způsobu vedení lidí$", + "^The level of motivation process as a leadership$", + "^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$", + "(?i)^news and events$", + "(?i)^NOVOSTI I DOGAĐAJI$", + "^Sansū no gakushū$", + "^Posouzení informačního systému firmy a návrh změn$", + "^Information System Assessment and Proposal for ICT Modification$", + "^Stresové zatížení pracovníků ve vybrané profesi$", + "^Stress load in a specific job$", + "^Sunday: Poster Sessions, Pt.*$", + "^Monday: Poster Sessions, Pt.*$", + "^Wednesday: Poster Sessions, Pt.*", + "^Tuesday: Poster Sessions, Pt.*$", + "^Analýza reklamy$", + "^Analysis of advertising$", + "^Shōgaku shūshinsho$", + "^Shōgaku sansū$", + "^Shintei joshi kokubun$", + "^Taishō joshi kokubun dokuhon$", + "^Joshi kokubun$", + "^Účetní uzávěrka a účetní závěrka v ČR$", + "(?i)^The \"?Causes\"? 
of Cancer$", + "^Normas para la publicación de artículos$", + "^Editor('|s)(s|') [Rr]eply$", + "^Editor(’|s)(s|’) letter$", + "^Redaktoriaus žodis$", + "^DISCUSSION ON THE PRECEDING PAPER$", + "^Kōtō shōgaku shūshinsho jidōyō$", + "^Shōgaku nihon rekishi$", + "^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$", + "^Préface$", + "^Occupational [Hh]ealth [Ss]ervices.$", + "^In Memoriam Professor Toshiyuki TAKESHIMA$", + "^Účetní závěrka ve vybraném podniku.*$", + "^Financial statements in selected company$", + "^Abdominal [Aa]ortic [Aa]neurysms.*$", + "^Pseudomyxoma peritonei$", + "^Kazalo autora$", + "(?i)^uvodna riječ$", + "^Motivace jako způsob vedení lidí$", + "^Motivation as a leadership$", + "^Polyfunkční dům$", + "^Multi\\-funkcional building$", + "^Podnikatelský plán$", + "(?i)^Podnikatelský záměr$", + "(?i)^Business Plan$", + "^Oceňování nemovitostí$", + "^Marketingová komunikace$", + "^Marketing communication$", + "^Sumario Analítico$", + "^Riječ uredništva$", + "^Savjetovanja i priredbe$", + "^Índice$", + "^(Starobosanski nadpisi).*$", + "^Vzdělávání pracovníků v organizaci$", + "^Staff training in organization$", + "^(Life Histories of North American Geometridae).*$", + "^Strategická analýza podniku$", + "^Strategic Analysis of an Enterprise$", + "^Sadržaj$", + "^Upute suradnicima$", + "^Rodinný dům$", + "(?i)^Fami(l)?ly house$", + "^Upute autorima$", + "^Strategic Analysis$", + "^Finanční analýza vybraného podniku$", + "^Finanční analýza$", + "^Riječ urednika$", + "(?i)^Content(s?)$", + "(?i)^Inhalt$", + "^Jinjō shōgaku shūshinsho jidōyō$", + "(?i)^Index$", + "^Chūgaku kokubun kyōkasho$", + "^Retrato de una mujer$", + "^Retrato de un hombre$", + "^Kōtō shōgaku dokuhon$", + "^Shotōka kokugo$", + "^Shōgaku dokuhon$", + "^Jinjō shōgaku kokugo dokuhon$", + "^Shinsei kokugo dokuhon$", + "^Teikoku dokuhon$", + "^Instructions to Authors$", + "^KİTAP TAHLİLİ$", + "^PRZEGLĄD PIŚMIENNICTWA$", + "(?i)^Presentación$", + "^İçindekiler$", 
+ "(?i)^Tabl?e of contents$", + "^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$", + "^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*", + "^Editorial( Board)?$", + "(?i)^Editorial \\(English\\)$", + "^Editörden$", + "^(Corpus Oral Dialectal \\(COD\\)\\.).*$", + "^(Kiri Karl Morgensternile).*$", + "^(\\[Eksliibris Aleksandr).*\\]$", + "^(\\[Eksliibris Aleksandr).*$", + "^(Eksliibris Aleksandr).*$", + "^(Kiri A\\. de Vignolles).*$", + "^(2 kirja Karl Morgensternile).*$", + "^(Pirita kloostri idaosa arheoloogilised).*$", + "^(Kiri tundmatule).*$", + "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", + "^(Eksliibris Nikolai Birukovile).*$", + "^(Eksliibris Nikolai Issakovile).*$", + "^(WHP Cruise Summary Information of section).*$", + "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", + "^(Measurement of the spin\\-dependent structure function).*", + "(?i)^.*authors['’′]? reply\\.?$", + "(?i)^.*authors['’′]? 
response\\.?$" + ] + }, + "synonyms": {} + } +} \ No newline at end of file diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/publication.example.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/publication.example.json new file mode 100644 index 000000000..e15cdc524 --- /dev/null +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/publication.example.json @@ -0,0 +1 @@ +{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "pid": [], "oaiprovenance": {"originDescription": {"metadataNamespace": "", "harvestDate": "2021-06-10T10:03:36.091Z", "baseURL": "file%3A%2F%2F%2Fvar%2Flib%2Fdnet%2Fdata%2Fsygma%2Fnew_ingestion%2Fcrossref", "datestamp": "", "altered": true, "identifier": ""}}, "relevantdate": [], "contributor": [], "id": "50|sygma_______::3bbb03e6ec8df0d219b2d2165ea1d446", "subject": [], "lastupdatetimestamp": 1628684944004, "author": [{"surname": "Pan", "fullname": "Pan, Mengwu", "pid": [], "name": "Mengwu", "rank": 1}, {"surname": "Blattner", "fullname": "Blattner, Christine", "pid": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "orcid_pending", "classname": "Open Researcher and Contributor ID", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "0000-0002-7250-5273"}], "name": "Christine", "rank": 2}], "collectedfrom": [{"value": "Sygma", "key": "10|openaire____::a8db6f6b2ce4fe72e8b2314a9a93e7d9"}], "instance": [{"refereed": {"classid": 
"UNKNOWN", "classname": "Unknown", "schemename": "dnet:review_levels", "schemeid": "dnet:review_levels"}, "hostedby": {"value": "Cancers", "key": "10|issn__online::69ba871b903253074dcf4054e619afff"}, "license": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "value": "https://creativecommons.org/licenses/by/4.0/"}, "url": ["http://dx.doi.org/10.3390/cancers13040745"], "pid": [], "distributionlocation": "", "alternateIdentifier": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.3390/cancers13040745"}], "collectedfrom": {"value": "Sygma", "key": "10|openaire____::a8db6f6b2ce4fe72e8b2314a9a93e7d9"}, "accessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "instancetype": {"classid": "0001", "classname": "Article", "schemename": "dnet:publication_resource", "schemeid": "dnet:publication_resource"}}], "resulttype": {"classid": "publication", "classname": "publication", "schemename": "dnet:result_typologies", "schemeid": "dnet:result_typologies"}, "dateofcollection": "2021-06-10T10:03:36.091Z", "fulltext": [], "dateoftransformation": "2021-07-20T16:59:21.682Z", "description": [], "format": [], "journal": {"issnPrinted": "", "vol": "13", "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": 
"sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "name": "Cancers", "iss": "4", "sp": "745", "edition": "", "issnOnline": "2072-6694", "ep": "", "issnLinking": ""}, "coverage": [], "externalReference": [], "language": {"classid": "eng", "classname": "English", "schemename": "dnet:languages", "schemeid": "dnet:languages"}, "bestaccessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "country": [], "extraInfo": [], "originalId": ["10.3390/cancers13040745", "50|sygma_______::3bbb03e6ec8df0d219b2d2165ea1d446"], "source": [], "context": [], "title": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "main title", "classname": "main title", "schemename": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title"}, "value": "Regulation of p53 by E3s"}]} \ No newline at end of file diff --git a/release.properties b/release.properties deleted file mode 100644 index f8c96de45..000000000 --- a/release.properties +++ /dev/null @@ -1,11 +0,0 @@ -#release configuration -#Tue Sep 29 12:04:49 CEST 2020 -scm.tagNameFormat=@{project.artifactId}-@{project.version} -pushChanges=true -scm.url=scm\:git\:https\://code-repo.d4science.org/D-Net/dnet-dedup.git -preparationGoals=clean verify -projectVersionPolicyId=default -remoteTagging=true -scm.commentPrefix=[maven-release-plugin] -exec.snapshotReleasePluginAllowed=false -completedPhase=check-poms