From bb5b845e3cd0116a3df8bd8c70d7d8ebabfd8b3c Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Mon, 17 Jul 2023 17:18:46 +0200 Subject: [PATCH 1/2] Use scala.binary.version property to resolve scala maven dependencies Ensure consistent usage of maven properties Profile for compiling with scala 2.12 and Spark 3.4 --- dhp-common/pom.xml | 8 +-- dhp-pace-core/pom.xml | 17 +++--- .../eu/dnetlib/pace/util/DiffPatchMatch.java | 18 +++++++ dhp-workflows/dhp-actionmanager/pom.xml | 4 +- dhp-workflows/dhp-aggregation/pom.xml | 11 +++- dhp-workflows/dhp-blacklist/pom.xml | 4 +- dhp-workflows/dhp-broker-events/pom.xml | 4 +- dhp-workflows/dhp-dedup-openaire/pom.xml | 21 +++++--- dhp-workflows/dhp-doiboost/pom.xml | 6 ++- dhp-workflows/dhp-enrichment/pom.xml | 6 +-- dhp-workflows/dhp-graph-mapper/pom.xml | 12 +++-- dhp-workflows/dhp-graph-provision/pom.xml | 8 +-- dhp-workflows/dhp-stats-promote/pom.xml | 4 +- dhp-workflows/dhp-stats-update/pom.xml | 4 +- .../dhp-usage-raw-data-update/pom.xml | 6 +-- dhp-workflows/dhp-usage-stats-build/pom.xml | 6 +-- pom.xml | 53 +++++++++++++++---- 17 files changed, 133 insertions(+), 59 deletions(-) diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index a7a83821e..6198bd81e 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -52,6 +52,8 @@ + true + ${scala.binary.version} ${scala.version} @@ -81,11 +83,11 @@ org.apache.spark - spark-core_2.11 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.11 + spark-sql_${scala.binary.version} @@ -159,7 +161,7 @@ eu.dnetlib.dhp - dhp-schemas + ${dhp-schemas.artifact} diff --git a/dhp-pace-core/pom.xml b/dhp-pace-core/pom.xml index 12174a5c5..fd7f44fc9 100644 --- a/dhp-pace-core/pom.xml +++ b/dhp-pace-core/pom.xml @@ -20,7 +20,7 @@ net.alchim31.maven scala-maven-plugin - 4.0.1 + ${net.alchim31.maven.version} scala-compile-first @@ -39,8 +39,9 @@ + true + ${scala.binary.version} ${scala.version} - -target:jvm-1.8 @@ -68,7 +69,6 @@ commons-io commons-io - org.antlr stringtemplate @@ -89,17 +89,22 @@ org.apache.commons commons-math3 - com.jayway.jsonpath json-path - com.ibm.icu icu4j - + + org.apache.spark + spark-core_${scala.binary.version} + + + org.apache.spark + spark-sql_${scala.binary.version} + diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java index 84d49bd5c..12c96500e 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java @@ -1,6 +1,24 @@ package eu.dnetlib.pace.util; +/* + * Diff Match and Patch + * Copyright 2018 The diff-match-patch Authors. + * https://github.com/google/diff-match-patch + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + /* * Diff Match and Patch * Copyright 2018 The diff-match-patch Authors. diff --git a/dhp-workflows/dhp-actionmanager/pom.xml b/dhp-workflows/dhp-actionmanager/pom.xml index 29e1fab1f..ce13502b6 100644 --- a/dhp-workflows/dhp-actionmanager/pom.xml +++ b/dhp-workflows/dhp-actionmanager/pom.xml @@ -11,12 +11,12 @@ org.apache.spark - spark-core_2.11 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.11 + spark-sql_${scala.binary.version} diff --git a/dhp-workflows/dhp-aggregation/pom.xml b/dhp-workflows/dhp-aggregation/pom.xml index 53d349d2a..108d25ba6 100644 --- a/dhp-workflows/dhp-aggregation/pom.xml +++ b/dhp-workflows/dhp-aggregation/pom.xml @@ -38,6 +38,8 @@ + true + ${scala.binary.version} ${scala.version} @@ -54,11 +56,11 @@ org.apache.spark - spark-core_2.11 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.11 + spark-sql_${scala.binary.version} @@ -75,6 +77,11 @@ dom4j dom4j + + org.scala-lang.modules + scala-xml_${scala.binary.version} + ${scala-xml.version} + xml-apis diff --git a/dhp-workflows/dhp-blacklist/pom.xml b/dhp-workflows/dhp-blacklist/pom.xml index 479a9e8c6..7ecc8b35d 100644 --- a/dhp-workflows/dhp-blacklist/pom.xml +++ b/dhp-workflows/dhp-blacklist/pom.xml @@ -16,11 +16,11 @@ org.apache.spark - spark-core_2.11 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.11 + spark-sql_${scala.binary.version} diff --git a/dhp-workflows/dhp-broker-events/pom.xml b/dhp-workflows/dhp-broker-events/pom.xml index 01f1ea321..322fc7e93 100644 --- a/dhp-workflows/dhp-broker-events/pom.xml +++ b/dhp-workflows/dhp-broker-events/pom.xml @@ -18,11 +18,11 @@ org.apache.spark - spark-core_2.11 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.11 + spark-sql_${scala.binary.version} org.elasticsearch diff --git a/dhp-workflows/dhp-dedup-openaire/pom.xml b/dhp-workflows/dhp-dedup-openaire/pom.xml index af4511c3c..a271efe8e 100644 --- a/dhp-workflows/dhp-dedup-openaire/pom.xml +++ b/dhp-workflows/dhp-dedup-openaire/pom.xml @@ -13,7 +13,7 @@ net.alchim31.maven scala-maven-plugin - 4.0.1 + ${net.alchim31.maven.version} scala-compile-first @@ -32,6 +32,8 @@ + true + ${scala.binary.version} ${scala.version} @@ -53,30 +55,35 @@ ${project.version} + + org.apache.commons + commons-lang3 + + org.scala-lang.modules - scala-java8-compat_2.11 + scala-java8-compat_${scala.binary.version} 1.0.2 org.scala-lang.modules - scala-collection-compat_2.11 - 2.8.0 + scala-collection-compat_${scala.binary.version} + 2.11.0 org.apache.spark - spark-core_2.11 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.11 + spark-sql_${scala.binary.version} org.apache.spark - spark-graphx_2.11 + spark-graphx_${scala.binary.version} diff --git a/dhp-workflows/dhp-doiboost/pom.xml b/dhp-workflows/dhp-doiboost/pom.xml index 37accbc4f..6e8911fba 100644 --- a/dhp-workflows/dhp-doiboost/pom.xml +++ b/dhp-workflows/dhp-doiboost/pom.xml @@ -33,6 +33,8 @@ + true + ${scala.binary.version} ${scala.version} @@ -70,12 +72,12 @@ org.apache.spark - spark-core_2.11 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.11 + spark-sql_${scala.binary.version} diff --git a/dhp-workflows/dhp-enrichment/pom.xml b/dhp-workflows/dhp-enrichment/pom.xml index 591cad252..9698dee03 100644 --- a/dhp-workflows/dhp-enrichment/pom.xml +++ b/dhp-workflows/dhp-enrichment/pom.xml @@ -12,11 +12,11 @@ org.apache.spark - spark-core_2.11 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.11 + spark-sql_${scala.binary.version} @@ -27,7 +27,7 @@ org.apache.spark - spark-hive_2.11 + spark-hive_${scala.binary.version} test diff --git a/dhp-workflows/dhp-graph-mapper/pom.xml b/dhp-workflows/dhp-graph-mapper/pom.xml index f579a7d2b..ef35951c0 100644 --- a/dhp-workflows/dhp-graph-mapper/pom.xml +++ b/dhp-workflows/dhp-graph-mapper/pom.xml @@ -14,7 +14,7 @@ net.alchim31.maven scala-maven-plugin - 4.0.1 + ${net.alchim31.maven.version} scala-compile-first @@ -37,6 +37,8 @@ -Xmax-classfile-name 200 + true + ${scala.binary.version} ${scala.version} @@ -64,15 +66,15 @@ org.apache.spark - spark-core_2.11 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.11 + spark-sql_${scala.binary.version} org.apache.spark - spark-hive_2.11 + spark-hive_${scala.binary.version} test @@ -125,7 +127,7 @@ org.json4s - json4s-jackson_2.11 + json4s-jackson_${scala.binary.version} diff --git a/dhp-workflows/dhp-graph-provision/pom.xml b/dhp-workflows/dhp-graph-provision/pom.xml index 413cc8cdd..e62fcdf19 100644 --- a/dhp-workflows/dhp-graph-provision/pom.xml +++ b/dhp-workflows/dhp-graph-provision/pom.xml @@ -14,7 +14,7 @@ net.alchim31.maven scala-maven-plugin - 4.0.1 + ${net.alchim31.maven.version} scala-compile-first @@ -37,6 +37,8 @@ -Xmax-classfile-name 200 + true + ${scala.binary.version} ${scala.version} @@ -48,11 +50,11 @@ org.apache.spark - spark-core_2.11 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.11 + spark-sql_${scala.binary.version} com.jayway.jsonpath diff --git a/dhp-workflows/dhp-stats-promote/pom.xml b/dhp-workflows/dhp-stats-promote/pom.xml index ce3e739a5..9e17a78dc 100644 --- a/dhp-workflows/dhp-stats-promote/pom.xml +++ b/dhp-workflows/dhp-stats-promote/pom.xml @@ -10,11 +10,11 @@ org.apache.spark - spark-core_2.11 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.11 + spark-sql_${scala.binary.version} diff --git a/dhp-workflows/dhp-stats-update/pom.xml b/dhp-workflows/dhp-stats-update/pom.xml index 2bc610c42..f491b5868 100644 --- a/dhp-workflows/dhp-stats-update/pom.xml +++ b/dhp-workflows/dhp-stats-update/pom.xml @@ -10,11 +10,11 @@ org.apache.spark - spark-core_2.11 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.11 + spark-sql_${scala.binary.version} diff --git a/dhp-workflows/dhp-usage-raw-data-update/pom.xml b/dhp-workflows/dhp-usage-raw-data-update/pom.xml index 954c8bd39..a9dbb09ae 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/pom.xml +++ b/dhp-workflows/dhp-usage-raw-data-update/pom.xml @@ -46,13 +46,11 @@ org.apache.spark - spark-core_2.11 - 2.2.0 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.11 - 2.4.5 + spark-sql_${scala.binary.version} com.googlecode.json-simple diff --git a/dhp-workflows/dhp-usage-stats-build/pom.xml b/dhp-workflows/dhp-usage-stats-build/pom.xml index 54e18580b..56aec73b7 100644 --- a/dhp-workflows/dhp-usage-stats-build/pom.xml +++ b/dhp-workflows/dhp-usage-stats-build/pom.xml @@ -46,13 +46,11 @@ org.apache.spark - spark-core_2.11 - 2.2.0 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.11 - 2.4.5 + spark-sql_${scala.binary.version} com.googlecode.json-simple diff --git a/pom.xml b/pom.xml index 0f365a79b..c6b65e27a 100644 --- a/pom.xml +++ b/pom.xml @@ -142,7 +142,7 @@ eu.dnetlib.dhp - dhp-schemas + ${dhp-schemas.artifact} ${dhp-schemas.version} @@ -171,25 +171,25 @@ org.apache.spark - spark-core_2.11 + spark-core_${scala.binary.version} ${dhp.spark.version} provided org.apache.spark - spark-sql_2.11 + spark-sql_${scala.binary.version} ${dhp.spark.version} provided org.apache.spark - spark-graphx_2.11 + spark-graphx_${scala.binary.version} ${dhp.spark.version} provided org.apache.spark - spark-hive_2.11 + spark-hive_${scala.binary.version} ${dhp.spark.version} test @@ -295,7 +295,7 @@ com.lucidworks.spark spark-solr - 3.6.0 + ${sparksolr.version} * @@ -518,7 +518,7 @@ org.json4s - json4s-jackson_2.11 + json4s-jackson_${scala.binary.version} ${json4s.version} @@ -610,7 +610,11 @@ target/test-classes - + + org.apache.maven.plugins + maven-plugin-plugin + 3.3 + org.apache.maven.plugins maven-project-info-reports-plugin @@ -694,7 +698,7 @@ org.antipathy - mvn-scalafmt_2.11 + mvn-scalafmt_${scala.binary.version} 1.0.1640073709.733712b @@ -751,7 +755,7 @@ org.antipathy - mvn-scalafmt_2.11 + mvn-scalafmt_${scala.binary.version} https://code-repo.d4science.org/D-Net/dnet-hadoop/raw/branch/beta/dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf false @@ -860,12 +864,16 @@ cdh5.9.2 2.6.0-${dhp.cdh.version} 4.1.0-${dhp.cdh.version} + dhp-schemas + 3.6.0 2.4.0.cloudera2 2.9.6 3.5 true 11.0.2 2.11.12 + 2.11 + 1.3.0 5.6.1 3.3.3 3.4.2 @@ -889,4 +897,29 @@ 1.1.3 3.2.1 + + + + + scala-2.12 + + 2.12 + 2.12.18 + + + + 4.0.2 + 3.4.1 + 2.14.2 + 3.12.0 + 3.7.0-M11 + 4.8.1 + + + + + \ No newline at end of file From e64c2854a369ae9787fadd210be3ac700b64b996 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Tue, 18 Jul 2023 11:38:56 +0200 Subject: [PATCH 2/2] Refactor Dedup process to use Spark Dataframe API and intermediate representation with Row interface JsonPath cache contention fixed by using a ConcurrentHashMap Blacklist filtering performance improvement Minor performance improvements when evaluating similarity Sorting in clustered elements is deterministic (by ordering and identity field, instead of ordering field only) --- .../AbstractClusteringFunction.java | 4 +- .../BlacklistAwareClusteringCombiner.java | 60 ---- .../pace/clustering/ClusteringCombiner.java | 64 ---- .../pace/clustering/ClusteringFunction.java | 3 +- .../pace/clustering/KeywordsClustering.java | 5 +- .../pace/clustering/LastNameFirstInitial.java | 4 +- .../pace/clustering/LowercaseClustering.java | 7 +- .../dnetlib/pace/clustering/NGramUtils.java | 6 +- .../dnetlib/pace/clustering/NgramPairs.java | 7 +- .../eu/dnetlib/pace/clustering/Ngrams.java | 20 +- .../pace/clustering/PersonClustering.java | 11 +- .../pace/clustering/SortedNgramPairs.java | 2 +- .../pace/clustering/UrlClustering.java | 4 +- .../pace/common/AbstractPaceFunctions.java | 24 +- .../java/eu/dnetlib/pace/config/Config.java | 11 +- .../eu/dnetlib/pace/config/DedupConfig.java | 37 +- .../eu/dnetlib/pace/config/PaceConfig.java | 13 +- .../eu/dnetlib/pace/model/AbstractField.java | 72 ---- .../java/eu/dnetlib/pace/model/Document.java | 40 --- .../java/eu/dnetlib/pace/model/Field.java | 57 ---- .../java/eu/dnetlib/pace/model/FieldDef.java | 14 - .../java/eu/dnetlib/pace/model/FieldList.java | 25 -- .../eu/dnetlib/pace/model/FieldListImpl.java | 315 ------------------ .../eu/dnetlib/pace/model/FieldValue.java | 26 -- .../eu/dnetlib/pace/model/FieldValueImpl.java | 135 -------- .../eu/dnetlib/pace/model/MapDocument.java | 143 -------- .../pace/model/MapDocumentComparator.java | 52 --- .../pace/model/MapDocumentSerializer.java | 103 ------ .../pace/model/RowDataOrderingComparator.java | 65 ++++ .../eu/dnetlib/pace/model/SparkDeduper.scala | 131 ++++++++ .../eu/dnetlib/pace/model/SparkModel.scala | 108 ++++++ .../eu/dnetlib/pace/tree/AlwaysMatch.java | 5 +- .../eu/dnetlib/pace/tree/AuthorsMatch.java | 26 +- .../java/eu/dnetlib/pace/tree/CityMatch.java | 4 +- .../dnetlib/pace/tree/CosineSimilarity.java | 22 +- .../eu/dnetlib/pace/tree/DoiExactMatch.java | 5 +- .../dnetlib/pace/tree/DomainExactMatch.java | 5 +- .../java/eu/dnetlib/pace/tree/ExactMatch.java | 4 +- .../pace/tree/ExactMatchIgnoreCase.java | 18 +- .../dnetlib/pace/tree/InstanceTypeMatch.java | 17 +- .../eu/dnetlib/pace/tree/JaroWinkler.java | 4 +- .../pace/tree/JaroWinklerNormalizedName.java | 4 +- .../dnetlib/pace/tree/JaroWinklerTitle.java | 4 +- .../eu/dnetlib/pace/tree/JsonListMatch.java | 25 +- .../eu/dnetlib/pace/tree/KeywordMatch.java | 4 +- .../dnetlib/pace/tree/Level2JaroWinkler.java | 4 +- .../pace/tree/Level2JaroWinklerTitle.java | 4 +- .../dnetlib/pace/tree/Level2Levenstein.java | 4 +- .../java/eu/dnetlib/pace/tree/Levenstein.java | 4 +- .../eu/dnetlib/pace/tree/LevensteinTitle.java | 4 +- .../tree/LevensteinTitleIgnoreVersion.java | 4 +- .../dnetlib/pace/tree/ListContainsMatch.java | 15 +- .../eu/dnetlib/pace/tree/MustBeDifferent.java | 4 +- .../dnetlib/pace/tree/NullDistanceAlgo.java | 5 +- .../dnetlib/pace/tree/NumbersComparator.java | 4 +- .../eu/dnetlib/pace/tree/NumbersMatch.java | 4 +- .../eu/dnetlib/pace/tree/RomansMatch.java | 4 +- .../java/eu/dnetlib/pace/tree/SizeMatch.java | 24 +- .../pace/tree/StringContainsMatch.java | 4 +- .../eu/dnetlib/pace/tree/StringListMatch.java | 13 +- .../pace/tree/SubStringLevenstein.java | 41 +-- .../dnetlib/pace/tree/TitleVersionMatch.java | 14 +- .../java/eu/dnetlib/pace/tree/UrlMatcher.java | 10 +- .../java/eu/dnetlib/pace/tree/YearMatch.java | 11 +- .../pace/tree/support/AbstractComparator.java | 80 ++--- .../tree/support/AbstractListComparator.java | 39 +++ .../support/AbstractSortedComparator.java | 20 +- .../support/AbstractStringComparator.java | 46 +++ .../dnetlib/pace/tree/support/Comparator.java | 6 +- .../dnetlib/pace/tree/support/FieldStats.java | 15 +- .../dnetlib/pace/tree/support/MatchType.java | 17 +- .../pace/tree/support/TreeNodeDef.java | 48 +-- .../pace/tree/support/TreeProcessor.java | 48 +-- .../eu/dnetlib/pace/util/BlockProcessor.java | 157 +++------ .../pace/util/BlockProcessorForTesting.java | 276 --------------- .../eu/dnetlib/pace/util/DiffPatchMatch.java | 17 - .../eu/dnetlib/pace/util/MapDocumentUtil.java | 170 ++++------ .../eu/dnetlib/pace/util/SparkReporter.java | 85 +++++ .../eu/dnetlib/pace/AbstractPaceTest.java | 39 +-- .../clustering/ClusteringFunctionTest.java | 68 ++-- .../pace/comparators/ComparatorTest.java | 60 ++-- .../eu/dnetlib/pace/config/ConfigTest.java | 104 +----- .../java/eu/dnetlib/pace/util/UtilTest.java | 5 +- .../project/utils/EXCELParser.java | 4 +- .../project/utils/ReadProjects.java | 9 +- .../project/utils/ReadTopics.java | 4 +- .../collection/plugin/oai/OaiIterator.java | 2 +- .../dhp/broker/oa/CheckDuplictedIdsJob.java | 2 +- .../dhp/broker/oa/util/TrustUtils.java | 18 +- .../aggregators/stats/StatsAggregator.java | 2 +- .../oa/samples/SimpleVariableJobTest.java | 9 +- .../dhp/oa/dedup/AbstractSparkAction.java | 4 +- .../eu/dnetlib/dhp/oa/dedup/DatePicker.java | 6 +- .../eu/dnetlib/dhp/oa/dedup/DedupUtility.java | 46 +-- .../java/eu/dnetlib/dhp/oa/dedup/Deduper.java | 58 ---- .../dnetlib/dhp/oa/dedup/SparkBlockStats.java | 61 ++-- .../dhp/oa/dedup/SparkCreateSimRels.java | 45 +-- .../dnetlib/dhp/oa/dedup/SparkReporter.java | 50 --- .../dhp/oa/dedup/SparkWhitelistSimRels.java | 66 ++-- .../oa/dedup/graph/ConnectedComponent.java | 2 +- .../eu/dnetlib/dhp/oa/dedup/model/Block.java | 80 ----- .../dhp/oa/dedup/EntityMergerTest.java | 8 +- .../dnetlib/dhp/oa/dedup/SparkDedupTest.java | 32 +- .../dhp/oa/dedup/SparkOpenorgsDedupTest.java | 4 +- .../oa/dedup/SparkOpenorgsProvisionTest.java | 4 - .../oa/dedup/SparkPublicationRootsTest.java | 2 +- .../dhp/oa/dedup/jpath/JsonPathTest.java | 300 +---------------- .../dedup/jpath/dedup_conf_organization.json | 269 +++++++++++++++ .../dhp/oa/dedup/jpath/organization.json | 241 ++++++++++++++ .../OrcidPropagationJobTest.java | 2 - .../dhp/sx/graph/SparkCreateInputGraph.scala | 2 +- .../dhp/oa/provision/XmlIndexingJob.java | 5 + .../oa/provision/utils/XmlRecordFactory.java | 1 - 113 files changed, 1644 insertions(+), 2886 deletions(-) delete mode 100644 dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java delete mode 100644 dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java delete mode 100644 dhp-pace-core/src/main/java/eu/dnetlib/pace/model/AbstractField.java delete mode 100644 dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Document.java delete mode 100644 dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Field.java delete mode 100644 dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldList.java delete mode 100644 dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldListImpl.java delete mode 100644 dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValue.java delete mode 100644 dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java delete mode 100644 dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocument.java delete mode 100644 dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentComparator.java delete mode 100644 dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentSerializer.java create mode 100644 dhp-pace-core/src/main/java/eu/dnetlib/pace/model/RowDataOrderingComparator.java create mode 100644 dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkDeduper.scala create mode 100644 dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala create mode 100644 dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractListComparator.java create mode 100644 dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractStringComparator.java delete mode 100644 dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java create mode 100644 dhp-pace-core/src/main/java/eu/dnetlib/pace/util/SparkReporter.java delete mode 100644 dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java delete mode 100644 dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkReporter.java delete mode 100644 dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/Block.java create mode 100644 dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/oa/dedup/jpath/dedup_conf_organization.json create mode 100644 dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/oa/dedup/jpath/organization.json diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java index e984f5d18..3da8eb490 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java @@ -11,7 +11,6 @@ import org.apache.commons.lang3.StringUtils; import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction { @@ -24,11 +23,10 @@ public abstract class AbstractClusteringFunction extends AbstractPaceFunctions i protected abstract Collection doApply(Config conf, String s); @Override - public Collection apply(Config conf, List fields) { + public Collection apply(Config conf, List fields) { return fields .stream() .filter(f -> !f.isEmpty()) - .map(Field::stringValue) .map(this::normalize) .map(s -> filterAllStopWords(s)) .map(s -> doApply(conf, s)) diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java deleted file mode 100644 index f0e93b8ba..000000000 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java +++ /dev/null @@ -1,60 +0,0 @@ - -package eu.dnetlib.pace.clustering; - -import java.util.Collection; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.regex.Pattern; - -import com.google.common.collect.Maps; - -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Document; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldListImpl; -import eu.dnetlib.pace.model.MapDocument; - -public class BlacklistAwareClusteringCombiner extends ClusteringCombiner { - - public static Collection filterAndCombine(final MapDocument a, final Config conf) { - Document filtered = filter(a, conf.blacklists()); - return combine(filtered, conf); - } - - private static MapDocument filter(final MapDocument a, final Map> blacklists) { - if (blacklists == null || blacklists.isEmpty()) { - return a; - } - - final Map filtered = Maps.newHashMap(a.getFieldMap()); - - for (final Entry> e : blacklists.entrySet()) { - Field fields = a.getFieldMap().get(e.getKey()); - if (fields != null) { - final FieldListImpl fl = new FieldListImpl(); - - for (Field f : fields) { - if (!isBlackListed(f.stringValue(), e.getValue())) { - fl.add(f); - } - } - - filtered.put(e.getKey(), fl); - } - } - - return new MapDocument(a.getIdentifier(), filtered); - } - - private static boolean isBlackListed(String value, List blacklist) { - for (Pattern pattern : blacklist) { - if (pattern.matcher(value).matches()) { - return true; - } - } - - return false; - } - -} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java deleted file mode 100644 index 3a6f17e20..000000000 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java +++ /dev/null @@ -1,64 +0,0 @@ - -package eu.dnetlib.pace.clustering; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; -import java.util.stream.Collectors; - -import org.apache.commons.lang3.StringUtils; - -import com.google.common.collect.Sets; - -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.ClusteringDef; -import eu.dnetlib.pace.model.Document; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldValueImpl; - -public class ClusteringCombiner { - - private static String SEPARATOR = ":"; - private static String COLLAPSE_ON = "collapseOn"; - - public static Collection combine(final Document a, final Config conf) { - final Collection res = Sets.newLinkedHashSet(); - for (final ClusteringDef cd : conf.clusterings()) { - for (final String fieldName : cd.getFields()) { - String prefix = getPrefix(cd, fieldName); - - Field values = a.values(fieldName); - List fields = new ArrayList<>(); - - if (values instanceof FieldValueImpl) { - fields.add(values); - } else { - fields.addAll((List) values); - } - - res - .addAll( - cd - .clusteringFunction() - .apply(conf, fields) - .stream() - .map(k -> prefix + SEPARATOR + k) - .collect(Collectors.toList())); - } - } - return res; - } - - private static String getPrefix(ClusteringDef cd, String fieldName) { - return cd.getName() + SEPARATOR + - cd - .getParams() - .keySet() - .stream() - .filter(k -> k.contains(COLLAPSE_ON)) - .findFirst() - .map(k -> StringUtils.substringAfter(k, SEPARATOR)) - .orElse(fieldName); - } - -} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java index e72535160..8b7852418 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java @@ -6,11 +6,10 @@ import java.util.List; import java.util.Map; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; public interface ClusteringFunction { - public Collection apply(Config config, List fields); + public Collection apply(Config config, List fields); public Map getParams(); diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java index 60861aafd..38299adb4 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java @@ -6,9 +6,7 @@ import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; -import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; @ClusteringClass("keywordsclustering") public class KeywordsClustering extends AbstractClusteringFunction { @@ -40,11 +38,10 @@ public class KeywordsClustering extends AbstractClusteringFunction { } @Override - public Collection apply(final Config conf, List fields) { + public Collection apply(final Config conf, List fields) { return fields .stream() .filter(f -> !f.isEmpty()) - .map(Field::stringValue) .map(this::cleanup) .map(this::normalize) .map(s -> filterAllStopWords(s)) diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java index dc6f8f775..5a385961a 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java @@ -9,7 +9,6 @@ import org.apache.commons.lang3.StringUtils; import com.google.common.collect.Lists; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Person; @ClusteringClass("lnfi") @@ -22,11 +21,10 @@ public class LastNameFirstInitial extends AbstractClusteringFunction { } @Override - public Collection apply(Config conf, List fields) { + public Collection apply(Config conf, List fields) { return fields .stream() .filter(f -> !f.isEmpty()) - .map(Field::stringValue) .map(this::normalize) .map(s -> doApply(conf, s)) .map(c -> filterBlacklisted(c, ngramBlacklist)) diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java index 403d187fa..a3a6c4881 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java @@ -11,7 +11,6 @@ import com.google.common.collect.Lists; import com.google.common.collect.Sets; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; @ClusteringClass("lowercase") public class LowercaseClustering extends AbstractClusteringFunction { @@ -21,10 +20,10 @@ public class LowercaseClustering extends AbstractClusteringFunction { } @Override - public Collection apply(Config conf, List fields) { + public Collection apply(Config conf, List fields) { Collection c = Sets.newLinkedHashSet(); - for (Field f : fields) { - c.addAll(doApply(conf, f.stringValue())); + for (String f : fields) { + c.addAll(doApply(conf, f)); } return c; } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java index aa12f1279..6ee80b86e 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java @@ -8,15 +8,15 @@ import org.apache.commons.lang3.StringUtils; import eu.dnetlib.pace.common.AbstractPaceFunctions; public class NGramUtils extends AbstractPaceFunctions { + static private final NGramUtils NGRAMUTILS = new NGramUtils(); private static final int SIZE = 100; - private static Set stopwords = AbstractPaceFunctions + private static final Set stopwords = AbstractPaceFunctions .loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); public static String cleanupForOrdering(String s) { - NGramUtils utils = new NGramUtils(); - return (utils.filterStopWords(utils.normalize(s), stopwords) + StringUtils.repeat(" ", SIZE)) + return (NGRAMUTILS.filterStopWords(NGRAMUTILS.normalize(s), stopwords) + StringUtils.repeat(" ", SIZE)) .substring(0, SIZE) .replaceAll(" ", ""); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java index 0656312c7..aa06aa408 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java @@ -2,7 +2,6 @@ package eu.dnetlib.pace.clustering; import java.util.Collection; -import java.util.HashMap; import java.util.List; import java.util.Map; @@ -14,7 +13,11 @@ import eu.dnetlib.pace.config.Config; public class NgramPairs extends Ngrams { public NgramPairs(Map params) { - super(params); + super(params, false); + } + + public NgramPairs(Map params, boolean sorted) { + super(params, sorted); } @Override diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java index bcc10a869..96c305a16 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java @@ -8,8 +8,15 @@ import eu.dnetlib.pace.config.Config; @ClusteringClass("ngrams") public class Ngrams extends AbstractClusteringFunction { + private final boolean sorted; + public Ngrams(Map params) { + this(params, false); + } + + public Ngrams(Map params, boolean sorted) { super(params); + this.sorted = sorted; } @Override @@ -19,20 +26,21 @@ public class Ngrams extends AbstractClusteringFunction { protected Collection getNgrams(String s, int ngramLen, int max, int maxPerToken, int minNgramLen) { - final Collection ngrams = new LinkedHashSet(); + final Collection ngrams = sorted ? new TreeSet<>() : new LinkedHashSet(); final StringTokenizer st = new StringTokenizer(s); while (st.hasMoreTokens()) { final String token = st.nextToken(); if (!token.isEmpty()) { - for (int i = 0; i < maxPerToken && ngramLen + i <= token.length(); i++) { - String ngram = (token + " ").substring(i, ngramLen + i).trim(); - if (ngrams.size() >= max) { - return ngrams; - } + String ngram = token.substring(i, Math.min(ngramLen + i, token.length())).trim(); + if (ngram.length() >= minNgramLen) { ngrams.add(ngram); + + if (ngrams.size() >= max) { + return ngrams; + } } } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java index 83b92f22c..b4a04ce65 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java @@ -12,7 +12,6 @@ import com.google.common.collect.Sets; import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Person; @ClusteringClass("personClustering") @@ -27,19 +26,19 @@ public class PersonClustering extends AbstractPaceFunctions implements Clusterin } @Override - public Collection apply(final Config conf, final List fields) { + public Collection apply(final Config conf, final List fields) { final Set hashes = Sets.newHashSet(); - for (final Field f : fields) { + for (final String f : fields) { - final Person person = new Person(f.stringValue(), false); + final Person person = new Person(f, false); if (StringUtils.isNotBlank(person.getNormalisedFirstName()) && StringUtils.isNotBlank(person.getNormalisedSurname())) { hashes.add(firstLC(person.getNormalisedFirstName()) + person.getNormalisedSurname().toLowerCase()); } else { - for (final String token1 : tokens(f.stringValue(), MAX_TOKENS)) { - for (final String token2 : tokens(f.stringValue(), MAX_TOKENS)) { + for (final String token1 : tokens(f, MAX_TOKENS)) { + for (final String token2 : tokens(f, MAX_TOKENS)) { if (!token1.equals(token2)) { hashes.add(firstLC(token1) + token2); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java index 1fc9f1747..b085ae26d 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java @@ -13,7 +13,7 @@ import eu.dnetlib.pace.config.Config; public class SortedNgramPairs extends NgramPairs { public SortedNgramPairs(Map params) { - super(params); + super(params, false); } @Override diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java index 122e01179..5b267ad10 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java @@ -11,7 +11,6 @@ import java.util.stream.Collectors; import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; @ClusteringClass("urlclustering") public class UrlClustering extends AbstractPaceFunctions implements ClusteringFunction { @@ -23,12 +22,11 @@ public class UrlClustering extends AbstractPaceFunctions implements ClusteringFu } @Override - public Collection apply(final Config conf, List fields) { + public Collection apply(final Config conf, List fields) { try { return fields .stream() .filter(f -> !f.isEmpty()) - .map(Field::stringValue) .map(this::asUrl) .map(URL::getHost) .collect(Collectors.toCollection(HashSet::new)); diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java index 9902508b8..b440686de 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java @@ -16,13 +16,11 @@ import org.apache.commons.lang3.StringUtils; import com.google.common.base.Joiner; import com.google.common.base.Splitter; import com.google.common.collect.Iterables; +import com.google.common.collect.Lists; import com.google.common.collect.Sets; import com.ibm.icu.text.Transliterator; import eu.dnetlib.pace.clustering.NGramUtils; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldList; -import eu.dnetlib.pace.model.FieldListImpl; /** * Set of common functions for the framework @@ -51,28 +49,25 @@ public abstract class AbstractPaceFunctions { protected static Set ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt"); // html regex for normalization - public final String HTML_REGEX = "<[^>]*>"; + public static final Pattern HTML_REGEX = Pattern.compile("<[^>]*>"); private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 "; private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń"; private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn"; // doi prefix for normalization - public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)"; + public static final Pattern DOI_PREFIX = Pattern.compile("(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)"); - private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?"); + private static Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?"); - private Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})"); - - protected final static FieldList EMPTY_FIELD = new FieldListImpl(); + private static Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})"); protected String concat(final List l) { return Joiner.on(" ").skipNulls().join(l); } protected String cleanup(final String s) { - - final String s1 = s.replaceAll(HTML_REGEX, ""); + final String s1 = HTML_REGEX.matcher(s).replaceAll(""); final String s2 = unicodeNormalization(s1.toLowerCase()); final String s3 = nfd(s2); final String s4 = fixXML(s3); @@ -162,11 +157,6 @@ public abstract class AbstractPaceFunctions { return sb.toString().replaceAll("\\s+", " "); } - protected String getFirstValue(final Field values) { - return (values != null) && !Iterables.isEmpty(values) ? Iterables.getFirst(values, EMPTY_FIELD).stringValue() - : ""; - } - protected boolean notNull(final String s) { return s != null; } @@ -316,7 +306,7 @@ public abstract class AbstractPaceFunctions { } public String normalizePid(String pid) { - return pid.toLowerCase().replaceAll(DOI_PREFIX, ""); + return DOI_PREFIX.matcher(pid.toLowerCase()).replaceAll(""); } // get the list of keywords into the input string diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java index 0623b468f..4d823d129 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java @@ -3,7 +3,7 @@ package eu.dnetlib.pace.config; import java.util.List; import java.util.Map; -import java.util.regex.Pattern; +import java.util.function.Predicate; import eu.dnetlib.pace.model.ClusteringDef; import eu.dnetlib.pace.model.FieldDef; @@ -30,13 +30,6 @@ public interface Config { */ public Map decisionTree(); - /** - * Field configuration definitions. - * - * @return the list of definitions - */ - public Map modelMap(); - /** * Clusterings. * @@ -49,7 +42,7 @@ public interface Config { * * @return the map */ - public Map> blacklists(); + public Map> blacklists(); /** * Translation map. diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java index ee24ff476..ac0ef08e4 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java @@ -4,18 +4,19 @@ package eu.dnetlib.pace.config; import java.io.IOException; import java.io.Serializable; import java.nio.charset.StandardCharsets; +import java.util.AbstractMap; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; +import java.util.function.Predicate; import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; import java.util.stream.Collectors; import org.antlr.stringtemplate.StringTemplate; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.databind.ObjectMapper; @@ -27,9 +28,6 @@ import eu.dnetlib.pace.tree.support.TreeNodeDef; import eu.dnetlib.pace.util.PaceException; public class DedupConfig implements Config, Serializable { - - private static final Log log = LogFactory.getLog(DedupConfig.class); - private static String CONFIG_TEMPLATE = "dedupConfig.st"; private PaceConfig pace; @@ -37,7 +35,7 @@ public class DedupConfig implements Config, Serializable { private WfConfig wf; @JsonIgnore - private Map> blacklists; + private Map> blacklists; private static Map defaults = Maps.newHashMap(); @@ -72,19 +70,29 @@ public class DedupConfig implements Config, Serializable { .getBlacklists() .entrySet() .stream() + .map( + e -> new AbstractMap.SimpleEntry>(e.getKey(), + e + .getValue() + .stream() + .filter(s -> !StringUtils.isBlank(s)) + .map(Pattern::compile) + .collect(Collectors.toList()))) .collect( Collectors .toMap( e -> e.getKey(), - e -> e + e -> (Predicate & Serializable) s -> e .getValue() .stream() - .filter(s -> !StringUtils.isBlank(s)) - .map(Pattern::compile) - .collect(Collectors.toList()))); + .filter(p -> p.matcher(s).matches()) + .findFirst() + .isPresent())) + + ; return config; - } catch (IOException e) { + } catch (IOException | PatternSyntaxException e) { throw new PaceException("Error in parsing configuration json", e); } @@ -152,18 +160,13 @@ public class DedupConfig implements Config, Serializable { return getPace().getModel(); } - @Override - public Map modelMap() { - return getPace().getModelMap(); - } - @Override public List clusterings() { return getPace().getClustering(); } @Override - public Map> blacklists() { + public Map> blacklists() { return blacklists; } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java index b4afad9c8..f1bc49f4a 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java @@ -28,6 +28,10 @@ public class PaceConfig extends AbstractPaceFunctions implements Serializable { @JsonIgnore private Map translationMap; + public Map getModelMap() { + return modelMap; + } + @JsonIgnore private Map modelMap; @@ -101,13 +105,4 @@ public class PaceConfig extends AbstractPaceFunctions implements Serializable { public void setSynonyms(Map> synonyms) { this.synonyms = synonyms; } - - public Map getModelMap() { - return modelMap; - } - - public void setModelMap(final Map modelMap) { - this.modelMap = modelMap; - } - } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/AbstractField.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/AbstractField.java deleted file mode 100644 index c11d461ab..000000000 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/AbstractField.java +++ /dev/null @@ -1,72 +0,0 @@ - -package eu.dnetlib.pace.model; - -import eu.dnetlib.pace.config.Type; - -/** - * The Class AbstractField. - */ -public abstract class AbstractField implements Field { - - /** The type. */ - protected Type type = Type.String; - - /** The name. */ - protected String name; - - /** - * Instantiates a new abstract field. - */ - protected AbstractField() { - } - - /** - * Instantiates a new abstract field. - * - * @param type - * the type - * @param name - * the name - */ - protected AbstractField(final Type type, final String name) { - this.type = type; - this.name = name; - } - - /* - * (non-Javadoc) - * @see eu.dnetlib.pace.model.Field#getName() - */ - @Override - public String getName() { - return name; - } - - /* - * (non-Javadoc) - * @see eu.dnetlib.pace.model.Field#getType() - */ - @Override - public Type getType() { - return type; - } - - /* - * (non-Javadoc) - * @see eu.dnetlib.pace.model.Field#setName(java.lang.String) - */ - @Override - public void setName(final String name) { - this.name = name; - } - - /* - * (non-Javadoc) - * @see eu.dnetlib.pace.model.Field#setType(eu.dnetlib.pace.config.Type) - */ - @Override - public void setType(final Type type) { - this.type = type; - } - -} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Document.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Document.java deleted file mode 100644 index d9c06d4e4..000000000 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Document.java +++ /dev/null @@ -1,40 +0,0 @@ - -package eu.dnetlib.pace.model; - -import java.util.Set; - -/** - * The Interface Document. Models the common operations available on a Pace Document. - */ -public interface Document { - - /** - * Gets the identifier. - * - * @return the identifier - */ - String getIdentifier(); - - /** - * Fields. - * - * @return the iterable - */ - Iterable fields(); - - /** - * Values. - * - * @param name - * the name - * @return the field list - */ - Field values(String name); - - /** - * Field names. - * - * @return the sets the - */ - Set fieldNames(); -} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Field.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Field.java deleted file mode 100644 index d5712cf2f..000000000 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Field.java +++ /dev/null @@ -1,57 +0,0 @@ - -package eu.dnetlib.pace.model; - -import java.io.Serializable; - -import eu.dnetlib.pace.config.Type; - -/** - * The Interface Field. - */ -public interface Field extends Iterable, Serializable { - - /** - * Gets the name. - * - * @return the name - */ - public String getName(); - - /** - * Sets the name. - * - * @param name - * the new name - */ - public void setName(String name); - - /** - * Gets the type. - * - * @return the type - */ - public Type getType(); - - /** - * Sets the type. - * - * @param type - * the new type - */ - public void setType(Type type); - - /** - * Checks if is empty. - * - * @return true, if is empty - */ - public boolean isEmpty(); - - /** - * String value. - * - * @return the string - */ - public String stringValue(); - -} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java index 8b123f2d5..f34545e6d 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java @@ -39,20 +39,6 @@ public class FieldDef implements Serializable { public FieldDef() { } - // def apply(s: String): Field[A] - public Field apply(final Type type, final String s) { - switch (type) { - case Int: - return new FieldValueImpl(type, name, Integer.parseInt(s)); - case String: - return new FieldValueImpl(type, name, s); - case List: - return new FieldListImpl(name, type); - default: - throw new IllegalArgumentException("Casting not implemented for type " + type); - } - } - public String getName() { return name; } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldList.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldList.java deleted file mode 100644 index b47795d8b..000000000 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldList.java +++ /dev/null @@ -1,25 +0,0 @@ - -package eu.dnetlib.pace.model; - -import java.util.List; - -/** - * The Interface FieldList. - */ -public interface FieldList extends List, Field { - - /** - * String list. - * - * @return the list - */ - public List stringList(); - - /** - * Double[] Array - * - * @return the double[] array - */ - public double[] doubleArray(); - -} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldListImpl.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldListImpl.java deleted file mode 100644 index ca23a0bfc..000000000 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldListImpl.java +++ /dev/null @@ -1,315 +0,0 @@ - -package eu.dnetlib.pace.model; - -import java.util.Collection; -import java.util.Iterator; -import java.util.List; -import java.util.ListIterator; - -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.base.Function; -import com.google.common.base.Joiner; -import com.google.common.collect.Iterables; -import com.google.common.collect.Lists; - -import eu.dnetlib.pace.config.Type; - -/** - * The Class FieldListImpl. - */ -public class FieldListImpl extends AbstractField implements FieldList { - - /** The fields. */ - private List fields; - - /** - * Instantiates a new field list impl. - */ - public FieldListImpl() { - fields = Lists.newArrayList(); - } - - /** - * Instantiates a new field list impl. - * - * @param name - * the name - */ - public FieldListImpl(final String name, final Type type) { - super(type, name); - fields = Lists.newArrayList(); - } - - /* - * (non-Javadoc) - * @see java.util.List#add(java.lang.Object) - */ - @Override - public boolean add(final Field f) { - return fields.add(f); - } - - /* - * (non-Javadoc) - * @see java.util.List#add(int, java.lang.Object) - */ - @Override - public void add(final int i, final Field f) { - fields.add(i, f); - } - - /* - * (non-Javadoc) - * @see java.util.List#addAll(java.util.Collection) - */ - @Override - public boolean addAll(final Collection f) { - return fields.addAll(f); - } - - /* - * (non-Javadoc) - * @see java.util.List#addAll(int, java.util.Collection) - */ - @Override - public boolean addAll(final int i, final Collection f) { - return fields.addAll(i, f); - } - - /* - * (non-Javadoc) - * @see java.util.List#clear() - */ - @Override - public void clear() { - fields.clear(); - } - - /* - * (non-Javadoc) - * @see java.util.List#contains(java.lang.Object) - */ - @Override - public boolean contains(final Object o) { - return fields.contains(o); - } - - /* - * (non-Javadoc) - * @see java.util.List#containsAll(java.util.Collection) - */ - @Override - public boolean containsAll(final Collection f) { - return fields.containsAll(f); - } - - /* - * (non-Javadoc) - * @see java.util.List#get(int) - */ - @Override - public Field get(final int i) { - return fields.get(i); - } - - /* - * (non-Javadoc) - * @see java.util.List#indexOf(java.lang.Object) - */ - @Override - public int indexOf(final Object o) { - return fields.indexOf(o); - } - - /* - * (non-Javadoc) - * @see eu.dnetlib.pace.model.Field#isEmpty() - */ - @Override - public boolean isEmpty() { - return Iterables.all(fields, f -> f.isEmpty()); - } - - /* - * (non-Javadoc) - * @see java.lang.Iterable#iterator() - */ - @Override - public Iterator iterator() { - return fields.iterator(); - } - - /* - * (non-Javadoc) - * @see java.util.List#lastIndexOf(java.lang.Object) - */ - @Override - public int lastIndexOf(final Object o) { - return fields.lastIndexOf(o); - } - - /* - * (non-Javadoc) - * @see java.util.List#listIterator() - */ - @Override - public ListIterator listIterator() { - return fields.listIterator(); - } - - /* - * (non-Javadoc) - * @see java.util.List#listIterator(int) - */ - @Override - public ListIterator listIterator(final int i) { - return fields.listIterator(i); - } - - /* - * (non-Javadoc) - * @see java.util.List#remove(java.lang.Object) - */ - @Override - public boolean remove(final Object o) { - return fields.remove(o); - } - - /* - * (non-Javadoc) - * @see java.util.List#remove(int) - */ - @Override - public Field remove(final int i) { - return fields.remove(i); - } - - /* - * (non-Javadoc) - * @see java.util.List#removeAll(java.util.Collection) - */ - @Override - public boolean removeAll(final Collection f) { - return fields.removeAll(f); - } - - /* - * (non-Javadoc) - * @see java.util.List#retainAll(java.util.Collection) - */ - @Override - public boolean retainAll(final Collection f) { - return fields.retainAll(f); - } - - /* - * (non-Javadoc) - * @see java.util.List#set(int, java.lang.Object) - */ - @Override - public Field set(final int i, final Field f) { - return fields.set(i, f); - } - - /* - * (non-Javadoc) - * @see java.util.List#size() - */ - @Override - public int size() { - return fields.size(); - } - - /* - * (non-Javadoc) - * @see java.util.List#subList(int, int) - */ - @Override - public List subList(final int from, final int to) { - return fields.subList(from, to); - } - - /* - * (non-Javadoc) - * @see java.util.List#toArray() - */ - @Override - public Object[] toArray() { - return fields.toArray(); - } - - /* - * (non-Javadoc) - * @see java.util.List#toArray(java.lang.Object[]) - */ - @Override - public T[] toArray(final T[] t) { - return fields.toArray(t); - } - - /* - * (non-Javadoc) - * @see eu.dnetlib.pace.model.Field#stringValue() - */ - @Override - public String stringValue() { - switch (getType()) { - - case List: - case Int: - case String: - return Joiner.on(" ").join(stringList()); - case JSON: - String json; - try { - json = new ObjectMapper().writeValueAsString(this); - } catch (JsonProcessingException e) { - json = null; - } - return json; - default: - throw new IllegalArgumentException("Unknown type: " + getType().toString()); - } - } - - /* - * (non-Javadoc) - * @see eu.dnetlib.pace.model.FieldList#stringList() - */ - @Override - public List stringList() { - return Lists.newArrayList(Iterables.transform(fields, getValuesTransformer())); - } - - private Function getValuesTransformer() { - return new Function() { - - @Override - public String apply(final Field f) { - return f.stringValue(); - } - }; - } - - @Override - public double[] doubleArray() { - return Lists.newArrayList(Iterables.transform(fields, getDouble())).stream().mapToDouble(d -> d).toArray(); - } - - private Function getDouble() { - - return new Function() { - @Override - public Double apply(final Field f) { - return Double.parseDouble(f.stringValue()); - } - }; - } - - @Override - public String toString() { - return stringList().toString(); - } - -} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValue.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValue.java deleted file mode 100644 index b20f21a5c..000000000 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValue.java +++ /dev/null @@ -1,26 +0,0 @@ - -package eu.dnetlib.pace.model; - -/** - * The Interface FieldValue. - */ -public interface FieldValue extends Field { - - /** - * Gets the value. - * - * @return the value - */ - public Object getValue(); - - /** - * Sets the value. - * - * @param value - * the new value - */ - public void setValue(final Object value); - - public double[] doubleArrayValue(); - -} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java deleted file mode 100644 index eff54abfb..000000000 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java +++ /dev/null @@ -1,135 +0,0 @@ - -package eu.dnetlib.pace.model; - -import java.net.MalformedURLException; -import java.net.URL; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; - -import org.apache.commons.lang3.StringUtils; - -import eu.dnetlib.pace.config.Type; - -/** - * The Class FieldValueImpl. - */ -public class FieldValueImpl extends AbstractField implements FieldValue { - - /** The value. */ - private Object value = null; - - /** - * Instantiates a new field value impl. - */ - public FieldValueImpl() { - } - - /** - * Instantiates a new field value impl. - * - * @param type - * the type - * @param name - * the name - * @param value - * the value - */ - public FieldValueImpl(final Type type, final String name, final Object value) { - super(type, name); - this.value = value; - } - - /* - * (non-Javadoc) - * @see eu.dnetlib.pace.model.Field#isEmpty() - */ - @Override - public boolean isEmpty() { - if (value == null) - return false; - - switch (type) { - case String: - case JSON: - return value.toString().isEmpty(); - case List: - try { - List list = (List) value; - return list.isEmpty() || ((FieldValueImpl) list.get(0)).isEmpty(); - } catch (Exception e) { - throw new RuntimeException(value.toString()); - } - case URL: - String str = value.toString(); - return StringUtils.isBlank(str) || !isValidURL(str); - case DoubleArray: - return doubleArrayValue().length == 0; - default: - return true; - } - } - - private boolean isValidURL(final String s) { - try { - new URL(s); - return true; - } catch (MalformedURLException e) { - return false; - } - } - - /* - * (non-Javadoc) - * @see eu.dnetlib.pace.model.FieldValue#getValue() - */ - @Override - public Object getValue() { - return value; - } - - /* - * (non-Javadoc) - * @see eu.dnetlib.pace.model.FieldValue#setValue(java.lang.Object) - */ - @Override - public void setValue(final Object value) { - this.value = value; - } - - /* - * (non-Javadoc) - * @see eu.dnetlib.pace.model.Field#stringValue() - */ - @Override - // @SuppressWarnings("unchecked") - public String stringValue() { - return String.valueOf(getValue()); - // switch (getType()) { - // - // case Int: - // return String.valueOf(getValue()); - // case List: - // return Joiner.on(" ").join((List) getValue()); - // case String: - // return (String) getValue(); - // default: - // throw new IllegalArgumentException("Unknown type: " + getType().toString()); - // } - } - - public double[] doubleArrayValue() { - return (double[]) getValue(); - } - - /* - * (non-Javadoc) - * @see java.lang.Iterable#iterator() - */ - @Override - @SuppressWarnings("unchecked") - public Iterator iterator() { - return Collections.singleton((Field) this).iterator(); - } - -} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocument.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocument.java deleted file mode 100644 index c2860ca3b..000000000 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocument.java +++ /dev/null @@ -1,143 +0,0 @@ - -package eu.dnetlib.pace.model; - -import java.io.Serializable; -import java.util.Map; -import java.util.Set; - -import com.google.common.collect.Iterables; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; - -/** - * The Class MapDocument. - */ -public class MapDocument implements Document, Serializable { - - /** The identifier. */ - private String identifier; - - /** The field map. */ - private Map fieldMap; - - /** - * Instantiates a new map document. - */ - public MapDocument() { - identifier = null; - fieldMap = Maps.newHashMap(); - } - - /** - * Instantiates a new map document. - * - * @param identifier - * the identifier - * @param fieldMap - * the field map - */ - public MapDocument(final String identifier, final Map fieldMap) { - this.setIdentifier(identifier); - this.fieldMap = fieldMap; - } - - /** - * Instantiates a new map document. - * - * @param identifier - * the identifier - * @param data - * the data - */ - public MapDocument(final String identifier, final byte[] data) { - final MapDocument doc = MapDocumentSerializer.decode(data); - - this.fieldMap = doc.fieldMap; - this.identifier = doc.identifier; - } - - /* - * (non-Javadoc) - * @see eu.dnetlib.pace.model.document.Document#fields() - */ - @Override - public Iterable fields() { - return Lists.newArrayList(Iterables.concat(fieldMap.values())); - } - - /* - * (non-Javadoc) - * @see eu.dnetlib.pace.model.document.Document#values(java.lang.String) - */ - @Override - public Field values(final String name) { - return fieldMap.get(name); - } - - /* - * (non-Javadoc) - * @see eu.dnetlib.pace.model.document.Document#fieldNames() - */ - @Override - public Set fieldNames() { - return fieldMap.keySet(); - } - - /* - * (non-Javadoc) - * @see java.lang.Object#toString() - */ - @Override - public String toString() { - return MapDocumentSerializer.toString(this); - // return String.format("Document(%s)", fieldMap.toString()); - } - - /** - * To byte array. - * - * @return the byte[] - */ - public byte[] toByteArray() { - return MapDocumentSerializer.toByteArray(this); - } - - /* - * (non-Javadoc) - * @see eu.dnetlib.pace.model.document.Document#getIdentifier() - */ - @Override - public String getIdentifier() { - return identifier; - } - - /** - * Sets the identifier. - * - * @param identifier - * the new identifier - */ - public void setIdentifier(final String identifier) { - this.identifier = identifier; - } - - /** - * Gets the field map. - * - * @return the field map - */ - public Map getFieldMap() { - return fieldMap; - } - - /** - * Sets the field map. - * - * @param fieldMap - * the field map - */ - public void setFieldMap(final Map fieldMap) { - this.fieldMap = fieldMap; - } - -} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentComparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentComparator.java deleted file mode 100644 index a77dcbc0c..000000000 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentComparator.java +++ /dev/null @@ -1,52 +0,0 @@ - -package eu.dnetlib.pace.model; - -import java.util.Comparator; - -import com.google.common.collect.Iterables; - -import eu.dnetlib.pace.clustering.NGramUtils; - -/** - * The Class MapDocumentComparator. - */ -public class MapDocumentComparator implements Comparator { - - /** The comparator field. */ - private String comparatorField; - - private final FieldList emptyField = new FieldListImpl(); - - /** - * Instantiates a new map document comparator. - * - * @param comparatorField - * the comparator field - */ - public MapDocumentComparator(final String comparatorField) { - this.comparatorField = comparatorField; - } - - /* - * (non-Javadoc) - * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object) - */ - @Override - public int compare(final Document d1, final Document d2) { - - if (d1.values(comparatorField).isEmpty() || d2.values(comparatorField).isEmpty()) - return 0; - - final String o1 = Iterables.getFirst(d1.values(comparatorField), emptyField).stringValue(); - final String o2 = Iterables.getFirst(d2.values(comparatorField), emptyField).stringValue(); - - if ((o1 == null) || (o2 == null)) - return 0; - - final String to1 = NGramUtils.cleanupForOrdering(o1); - final String to2 = NGramUtils.cleanupForOrdering(o2); - - return to1.compareTo(to2); - } - -} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentSerializer.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentSerializer.java deleted file mode 100644 index d71f780ad..000000000 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentSerializer.java +++ /dev/null @@ -1,103 +0,0 @@ - -package eu.dnetlib.pace.model; - -import java.lang.reflect.Type; - -import com.google.gson.GsonBuilder; -import com.google.gson.InstanceCreator; -import com.google.gson.JsonDeserializationContext; -import com.google.gson.JsonDeserializer; -import com.google.gson.JsonElement; -import com.google.gson.JsonObject; -import com.google.gson.JsonParseException; - -/** - * The Class MapDocumentSerializer. - */ -public class MapDocumentSerializer implements InstanceCreator { - - @Override - public MapDocument createInstance(final Type type) { - return new MapDocument(); - } - - /** - * Decode. - * - * @param s - * the String - * @return the map document - */ - public static MapDocument decode(final String s) { - final GsonBuilder gson = new GsonBuilder(); - - gson.registerTypeAdapter(Field.class, new JsonDeserializer() { - - @Override - public Field deserialize(final JsonElement json, final Type typeOfT, - final JsonDeserializationContext context) throws JsonParseException { - final FieldListImpl fl = new FieldListImpl(); - if (json.isJsonObject()) { - - fl.add(handleJsonObject(json.getAsJsonObject())); - - } else if (json.isJsonArray()) { - - for (final JsonElement e : json.getAsJsonArray()) { - if (e.isJsonObject()) { - fl.add(handleJsonObject(e.getAsJsonObject())); - } - } - } - return fl; - } - - private Field handleJsonObject(final JsonObject o) { - final FieldListImpl fl = new FieldListImpl(); - final String name = o.get("name").getAsString(); - final String type = o.get("type").getAsString(); - final String value = o.get("value").getAsString(); - fl.add(new FieldValueImpl(eu.dnetlib.pace.config.Type.valueOf(type), name, value)); - return fl; - } - }); - - return gson.create().fromJson(s, MapDocument.class); - } - - /** - * Decode. - * - * @param bytes - * the bytes - * @return the map document - */ - public static MapDocument decode(final byte[] bytes) { - return decode(new String(bytes)); - } - - /** - * To string. - * - * @param doc - * the doc - * @return the string - */ - public static String toString(final MapDocument doc) { - final GsonBuilder b = new GsonBuilder(); - return b.setPrettyPrinting().create().toJson(doc); - - } - - /** - * To byte array. - * - * @param doc - * the doc - * @return the byte[] - */ - public static byte[] toByteArray(final MapDocument doc) { - return toString(doc).getBytes(); - } - -} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/RowDataOrderingComparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/RowDataOrderingComparator.java new file mode 100644 index 000000000..f0ded0570 --- /dev/null +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/RowDataOrderingComparator.java @@ -0,0 +1,65 @@ + +package eu.dnetlib.pace.model; + +import java.util.Comparator; + +import org.apache.spark.sql.Row; + +import eu.dnetlib.pace.clustering.NGramUtils; + +/** + * The Class MapDocumentComparator. + */ +public class RowDataOrderingComparator implements Comparator { + + /** The comparator field. */ + private final int comparatorField; + private final int identityFieldPosition; + + /** + * Instantiates a new map document comparator. + * + * @param comparatorField + * the comparator field + */ + public RowDataOrderingComparator(final int comparatorField, int identityFieldPosition) { + this.comparatorField = comparatorField; + this.identityFieldPosition = identityFieldPosition; + } + + /* + * (non-Javadoc) + * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object) + */ + @Override + public int compare(final Row d1, final Row d2) { + if (d1 == null) + return d2 == null ? 0 : -1; + else if (d2 == null) { + return 1; + } + + final String o1 = d1.getString(comparatorField); + final String o2 = d2.getString(comparatorField); + + if (o1 == null) + return o2 == null ? 0 : -1; + else if (o2 == null) { + return 1; + } + + final String to1 = NGramUtils.cleanupForOrdering(o1); + final String to2 = NGramUtils.cleanupForOrdering(o2); + + int res = to1.compareTo(to2); + if (res == 0) { + res = o1.compareTo(o2); + if (res == 0) { + return d1.getString(identityFieldPosition).compareTo(d2.getString(identityFieldPosition)); + } + } + + return res; + } + +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkDeduper.scala b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkDeduper.scala new file mode 100644 index 000000000..b3f56bcdb --- /dev/null +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkDeduper.scala @@ -0,0 +1,131 @@ +package eu.dnetlib.pace.model + +import eu.dnetlib.pace.config.{DedupConfig, Type} +import eu.dnetlib.pace.util.{BlockProcessor, SparkReporter} +import org.apache.spark.SparkContext +import org.apache.spark.sql.catalyst.expressions.Literal +import org.apache.spark.sql.expressions._ +import org.apache.spark.sql.functions.{col, lit, udf} +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{Column, Dataset, Row, functions} + +import java.util.function.Predicate +import java.util.stream.Collectors +import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ +import scala.collection.mutable +case class SparkDeduper(conf: DedupConfig) extends Serializable { + + val model: SparkModel = SparkModel(conf) + + val dedup: (Dataset[Row] => Dataset[Row]) = df => { + df.transform(filterAndCleanup) + .transform(generateClustersWithCollect) + .transform(processBlocks) + } + + + val filterAndCleanup: (Dataset[Row] => Dataset[Row]) = df => { + val df_with_filters = conf.getPace.getModel.asScala.foldLeft(df)((res, fdef) => { + if (conf.blacklists.containsKey(fdef.getName)) { + res.withColumn( + fdef.getName + "_filtered", + filterColumnUDF(fdef).apply(new Column(fdef.getName)) + ) + } else { + res + } + }) + + df_with_filters + } + + def filterColumnUDF(fdef: FieldDef): UserDefinedFunction = { + val blacklist: Predicate[String] = conf.blacklists().get(fdef.getName) + + if (blacklist == null) { + throw new IllegalArgumentException("Column: " + fdef.getName + " does not have any filter") + } else { + fdef.getType match { + case Type.List | Type.JSON => + udf[Array[String], Array[String]](values => { + values.filter((v: String) => !blacklist.test(v)) + }) + + case _ => + udf[String, String](v => { + if (blacklist.test(v)) "" + else v + }) + } + } + } + + val generateClustersWithCollect: (Dataset[Row] => Dataset[Row]) = df_with_filters => { + var df_with_clustering_keys: Dataset[Row] = null + + for ((cd, idx) <- conf.clusterings().zipWithIndex) { + val inputColumns = cd.getFields().foldLeft(Seq[Column]())((acc, fName) => { + val column = if (conf.blacklists.containsKey(fName)) + Seq(col(fName + "_filtered")) + else + Seq(col(fName)) + + acc ++ column + }) + + // Add 'key' column with the value generated by the given clustering definition + val ds: Dataset[Row] = df_with_filters + .withColumn("clustering", lit(cd.getName + "::" + idx)) + .withColumn("key", functions.explode(clusterValuesUDF(cd).apply(functions.array(inputColumns: _*)))) + // Add position column having the position of the row within the set of rows having the same key value ordered by the sorting value + .withColumn("position", functions.row_number().over(Window.partitionBy("key").orderBy(col(model.orderingFieldName), col(model.identifierFieldName)))) + + if (df_with_clustering_keys == null) + df_with_clustering_keys = ds + else + df_with_clustering_keys = df_with_clustering_keys.union(ds) + } + + //TODO: analytics + + val df_with_blocks = df_with_clustering_keys + // filter out rows with position exceeding the maxqueuesize parameter + .filter(col("position").leq(conf.getWf.getQueueMaxSize)) + .groupBy("clustering", "key") + .agg(functions.collect_set(functions.struct(model.schema.fieldNames.map(col): _*)).as("block")) + .filter(functions.size(new Column("block")).gt(1)) + + df_with_blocks + } + + def clusterValuesUDF(cd: ClusteringDef) = { + udf[mutable.WrappedArray[String], mutable.WrappedArray[Any]](values => { + values.flatMap(f => cd.clusteringFunction().apply(conf, Seq(f.toString).asJava).asScala) + }) + } + + val processBlocks: (Dataset[Row] => Dataset[Row]) = df => { + df.filter(functions.size(new Column("block")).geq(new Literal(2, DataTypes.IntegerType))) + .withColumn("relations", processBlock(df.sqlContext.sparkContext).apply(new Column("block"))) + .select(functions.explode(new Column("relations")).as("relation")) + } + + def processBlock(implicit sc: SparkContext) = { + val accumulators = SparkReporter.constructAccumulator(conf, sc) + + udf[Array[(String, String)], mutable.WrappedArray[Row]](block => { + val reporter = new SparkReporter(accumulators) + + val mapDocuments = block.asJava.stream() + .sorted(new RowDataOrderingComparator(model.orderingFieldPosition, model.identityFieldPosition)) + .limit(conf.getWf.getQueueMaxSize) + .collect(Collectors.toList[Row]()) + + new BlockProcessor(conf, model.identityFieldPosition, model.orderingFieldPosition).processSortedRows(mapDocuments, reporter) + + reporter.getRelations.asScala.toArray + }).asNondeterministic() + } + +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala new file mode 100644 index 000000000..95325ace0 --- /dev/null +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala @@ -0,0 +1,108 @@ +package eu.dnetlib.pace.model + +import com.jayway.jsonpath.{Configuration, JsonPath} +import eu.dnetlib.pace.config.{DedupConfig, Type} +import eu.dnetlib.pace.util.MapDocumentUtil +import org.apache.spark.sql.catalyst.encoders.RowEncoder +import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} +import org.apache.spark.sql.{Dataset, Row} + +import java.util.regex.Pattern +import scala.collection.JavaConverters._ + +case class SparkModel(conf: DedupConfig) { + private val URL_REGEX: Pattern = Pattern.compile("^\\s*(http|https|ftp)\\://.*") + + private val CONCAT_REGEX: Pattern = Pattern.compile("\\|\\|\\|") + + val identifierFieldName = "identifier" + + val orderingFieldName = if (!conf.getWf.getOrderField.isEmpty) conf.getWf.getOrderField else identifierFieldName + + val schema: StructType = { + // create an implicit identifier field + val identifier = new FieldDef() + identifier.setName(identifierFieldName) + identifier.setType(Type.String) + + // Construct a Spark StructType representing the schema of the model + (Seq(identifier) ++ conf.getPace.getModel.asScala) + .foldLeft( + new StructType() + )((resType, fieldDef) => { + resType.add(fieldDef.getType match { + case Type.List | Type.JSON => + StructField(fieldDef.getName, DataTypes.createArrayType(DataTypes.StringType), true, Metadata.empty) + case Type.DoubleArray => + StructField(fieldDef.getName, DataTypes.createArrayType(DataTypes.DoubleType), true, Metadata.empty) + case _ => + StructField(fieldDef.getName, DataTypes.StringType, true, Metadata.empty) + }) + }) + + + } + + val identityFieldPosition: Int = schema.fieldIndex(identifierFieldName) + + val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName) + + val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => { + df.map(r => rowFromJson(r))(RowEncoder(schema)) + } + + def rowFromJson(json: String): Row = { + val documentContext = + JsonPath.using(Configuration.defaultConfiguration.addOptions(com.jayway.jsonpath.Option.SUPPRESS_EXCEPTIONS)).parse(json) + val values = new Array[Any](schema.size) + + values(identityFieldPosition) = MapDocumentUtil.getJPathString(conf.getWf.getIdPath, documentContext) + + schema.fieldNames.zipWithIndex.foldLeft(values) { + case ((res, (fname, index))) => { + val fdef = conf.getPace.getModelMap.get(fname) + + if (fdef != null) { + res(index) = fdef.getType match { + case Type.String | Type.Int => + MapDocumentUtil.truncateValue( + MapDocumentUtil.getJPathString(fdef.getPath, documentContext), + fdef.getLength + ) + + case Type.URL => + var uv = MapDocumentUtil.getJPathString(fdef.getPath, documentContext) + if (!URL_REGEX.matcher(uv).matches) + uv = "" + uv + + case Type.List | Type.JSON => + MapDocumentUtil.truncateList( + MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType), + fdef.getSize + ).toArray + + case Type.StringConcat => + val jpaths = CONCAT_REGEX.split(fdef.getPath) + + MapDocumentUtil.truncateValue( + jpaths + .map(jpath => MapDocumentUtil.getJPathString(jpath, documentContext)) + .mkString(" "), + fdef.getLength + ) + + case Type.DoubleArray => + MapDocumentUtil.getJPathArray(fdef.getPath, json) + } + } + + res + } + } + + new GenericRowWithSchema(values, schema) + } +} + diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java index f53655a8e..4d31df5b3 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java @@ -6,12 +6,11 @@ import java.util.Map; import com.wcohen.ss.AbstractStringDistance; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; @ComparatorClass("alwaysMatch") -public class AlwaysMatch extends AbstractComparator { +public class AlwaysMatch extends AbstractComparator { public AlwaysMatch(final Map params) { super(params, new com.wcohen.ss.JaroWinkler()); @@ -26,7 +25,7 @@ public class AlwaysMatch extends AbstractComparator { } @Override - public double compare(final Field a, final Field b, final Config conf) { + public double compare(final Object a, final Object b, final Config conf) { return 1.0; } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java index 047e121e3..5c6939e60 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java @@ -1,25 +1,19 @@ package eu.dnetlib.pace.tree; -import java.util.Comparator; import java.util.List; import java.util.Map; -import java.util.function.Function; import java.util.stream.Collectors; -import java.util.stream.Stream; -import com.google.common.collect.Iterables; import com.wcohen.ss.AbstractStringDistance; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldList; import eu.dnetlib.pace.model.Person; -import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.AbstractListComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; @ComparatorClass("authorsMatch") -public class AuthorsMatch extends AbstractComparator { +public class AuthorsMatch extends AbstractListComparator { Map params; @@ -49,24 +43,16 @@ public class AuthorsMatch extends AbstractComparator { } @Override - public double compare(final Field a, final Field b, final Config conf) { + public double compare(final List a, final List b, final Config conf) { if (a.isEmpty() || b.isEmpty()) return -1; - if (((FieldList) a).size() > SIZE_THRESHOLD || ((FieldList) b).size() > SIZE_THRESHOLD) + if (a.size() > SIZE_THRESHOLD || b.size() > SIZE_THRESHOLD) return 1.0; - List aList = ((FieldList) a) - .stringList() - .stream() - .map(author -> new Person(author, false)) - .collect(Collectors.toList()); - List bList = ((FieldList) b) - .stringList() - .stream() - .map(author -> new Person(author, false)) - .collect(Collectors.toList()); + List aList = a.stream().map(author -> new Person(author, false)).collect(Collectors.toList()); + List bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList()); common = 0; // compare each element of List1 with each element of List2 diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java index f3da29e8e..1d898ad83 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java @@ -5,11 +5,11 @@ import java.util.Map; import java.util.Set; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; @ComparatorClass("cityMatch") -public class CityMatch extends AbstractComparator { +public class CityMatch extends AbstractStringComparator { private Map params; diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CosineSimilarity.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CosineSimilarity.java index 82d84794f..d255612ba 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CosineSimilarity.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CosineSimilarity.java @@ -1,21 +1,14 @@ package eu.dnetlib.pace.tree; -import java.util.HashMap; -import java.util.List; import java.util.Map; -import java.util.stream.Collectors; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldList; -import eu.dnetlib.pace.model.FieldValueImpl; -import eu.dnetlib.pace.model.Person; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; @ComparatorClass("cosineSimilarity") -public class CosineSimilarity extends AbstractComparator { +public class CosineSimilarity extends AbstractComparator { Map params; @@ -24,15 +17,16 @@ public class CosineSimilarity extends AbstractComparator { } @Override - public double compare(final Field a, final Field b, final Config conf) { + public double compare(Object a, Object b, Config config) { + return compare((double[]) a, (double[]) b, config); + } - if (a.isEmpty() || b.isEmpty()) + public double compare(final double[] a, final double[] b, final Config conf) { + + if (a.length == 0 || b.length == 0) return -1; - double[] aVector = ((FieldValueImpl) a).doubleArrayValue(); - double[] bVector = ((FieldValueImpl) b).doubleArrayValue(); - - return cosineSimilarity(aVector, bVector); + return cosineSimilarity(a, b); } double cosineSimilarity(double[] a, double[] b) { diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java index 24f3dfc02..d3c5bc10d 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java @@ -3,7 +3,6 @@ package eu.dnetlib.pace.tree; import java.util.Map; -import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.tree.support.ComparatorClass; /** @@ -21,8 +20,8 @@ public class DoiExactMatch extends ExactMatchIgnoreCase { } @Override - protected String getValue(final Field f) { - return super.getValue(f).replaceAll(PREFIX, ""); + protected String toString(final Object f) { + return super.toString(f).replaceAll(PREFIX, ""); } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java index efafe6573..c28274652 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java @@ -5,7 +5,6 @@ import java.net.MalformedURLException; import java.net.URL; import java.util.Map; -import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.tree.support.ComparatorClass; @ComparatorClass("domainExactMatch") @@ -16,10 +15,10 @@ public class DomainExactMatch extends ExactMatchIgnoreCase { } @Override - protected String getValue(final Field f) { + protected String toString(final Object f) { try { - return asUrl(super.getValue(f)).getHost(); + return asUrl(super.toString(f)).getHost(); } catch (MalformedURLException e) { return ""; } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java index 85ce6744d..35357c553 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java @@ -6,11 +6,11 @@ import java.util.Map; import com.wcohen.ss.AbstractStringDistance; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; @ComparatorClass("exactMatch") -public class ExactMatch extends AbstractComparator { +public class ExactMatch extends AbstractStringComparator { public ExactMatch(Map params) { super(params, new com.wcohen.ss.JaroWinkler()); diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java index 307f02246..85c57ad40 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java @@ -4,30 +4,26 @@ package eu.dnetlib.pace.tree; import java.util.Map; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; @ComparatorClass("exactMatchIgnoreCase") -public class ExactMatchIgnoreCase extends AbstractComparator { +public class ExactMatchIgnoreCase extends AbstractStringComparator { public ExactMatchIgnoreCase(Map params) { super(params); } @Override - public double compare(Field a, Field b, final Config conf) { + public double compare(String a, String b, final Config conf) { - final String fa = getValue(a); - final String fb = getValue(b); - - if (fa.isEmpty() || fb.isEmpty()) + if (a.isEmpty() || b.isEmpty()) return -1; - return fa.equalsIgnoreCase(fb) ? 1 : 0; + return a.equalsIgnoreCase(b) ? 1 : 0; } - protected String getValue(final Field f) { - return getFirstValue(f); + protected String toString(final Object object) { + return toFirstString(object); } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java index bdef1225d..238cb16ce 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java @@ -10,13 +10,11 @@ import java.util.stream.Collectors; import com.google.common.collect.Sets; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldList; -import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.AbstractListComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; @ComparatorClass("instanceTypeMatch") -public class InstanceTypeMatch extends AbstractComparator { +public class InstanceTypeMatch extends AbstractListComparator { final Map translationMap = new HashMap<>(); @@ -42,21 +40,18 @@ public class InstanceTypeMatch extends AbstractComparator { } @Override - public double compare(final Field a, final Field b, final Config conf) { + public double compare(final List a, final List b, final Config conf) { if (a == null || b == null) { return -1; } - final List sa = ((FieldList) a).stringList(); - final List sb = ((FieldList) b).stringList(); - - if (sa.isEmpty() || sb.isEmpty()) { + if (a.isEmpty() || b.isEmpty()) { return -1; } - final Set ca = sa.stream().map(this::translate).collect(Collectors.toSet()); - final Set cb = sb.stream().map(this::translate).collect(Collectors.toSet()); + final Set ca = a.stream().map(this::translate).collect(Collectors.toSet()); + final Set cb = b.stream().map(this::translate).collect(Collectors.toSet()); // if at least one is a jolly type, it must produce a match if (ca.contains("*") || cb.contains("*")) diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java index 7511e5ec9..2cb411d26 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java @@ -6,12 +6,12 @@ import java.util.Map; import com.wcohen.ss.AbstractStringDistance; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; //case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler()) @ComparatorClass("jaroWinkler") -public class JaroWinkler extends AbstractComparator { +public class JaroWinkler extends AbstractStringComparator { public JaroWinkler(Map params) { super(params, new com.wcohen.ss.JaroWinkler()); diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java index 4f4c68e47..576b9281d 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java @@ -7,11 +7,11 @@ import java.util.Set; import com.wcohen.ss.AbstractStringDistance; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; @ComparatorClass("jaroWinklerNormalizedName") -public class JaroWinklerNormalizedName extends AbstractComparator { +public class JaroWinklerNormalizedName extends AbstractStringComparator { private Map params; diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java index d97d8d061..6ba7dd2a4 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java @@ -6,12 +6,12 @@ import java.util.Map; import com.wcohen.ss.AbstractStringDistance; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; //case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler()) @ComparatorClass("jaroWinklerTitle") -public class JaroWinklerTitle extends AbstractComparator { +public class JaroWinklerTitle extends AbstractStringComparator { public JaroWinklerTitle(Map params) { super(params, new com.wcohen.ss.JaroWinkler()); diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java index e5c69a852..3897e37f8 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java @@ -10,16 +10,18 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import com.google.common.collect.Sets; +import com.jayway.jsonpath.Configuration; +import com.jayway.jsonpath.DocumentContext; +import com.jayway.jsonpath.JsonPath; +import com.jayway.jsonpath.Option; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldList; -import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.AbstractListComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.util.MapDocumentUtil; @ComparatorClass("jsonListMatch") -public class JsonListMatch extends AbstractComparator { +public class JsonListMatch extends AbstractListComparator { private static final Log log = LogFactory.getLog(JsonListMatch.class); private Map params; @@ -34,11 +36,7 @@ public class JsonListMatch extends AbstractComparator { } @Override - public double compare(final Field a, final Field b, final Config conf) { - - final List sa = ((FieldList) a).stringList(); - final List sb = ((FieldList) b).stringList(); - + public double compare(final List sa, final List sb, final Config conf) { if (sa.isEmpty() || sb.isEmpty()) { return -1; } @@ -65,14 +63,17 @@ public class JsonListMatch extends AbstractComparator { StringBuilder st = new StringBuilder(); // to build the string used for comparisons basing on the jpath into // parameters - + final DocumentContext documentContext = JsonPath + .using(Configuration.defaultConfiguration().addOptions(Option.SUPPRESS_EXCEPTIONS)) + .parse(json); // for each path in the param list for (String key : params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) { String path = params.get(key); - String value = MapDocumentUtil.getJPathString(path, json); + String value = MapDocumentUtil.getJPathString(path, documentContext); if (value == null || value.isEmpty()) value = ""; - st.append(value + "::"); + st.append(value); + st.append("::"); } st.setLength(st.length() - 2); diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java index 0d69e5177..53acb4dc8 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java @@ -5,11 +5,11 @@ import java.util.Map; import java.util.Set; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; @ComparatorClass("keywordMatch") -public class KeywordMatch extends AbstractComparator { +public class KeywordMatch extends AbstractStringComparator { Map params; diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java index d483049d7..970f975f6 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java @@ -5,11 +5,11 @@ import java.util.Map; import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; @ComparatorClass("level2JaroWinkler") -public class Level2JaroWinkler extends AbstractComparator { +public class Level2JaroWinkler extends AbstractStringComparator { public Level2JaroWinkler(Map params) { super(params, new com.wcohen.ss.Level2JaroWinkler()); diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java index a87a6079a..e351058f9 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java @@ -6,11 +6,11 @@ import java.util.Map; import com.wcohen.ss.AbstractStringDistance; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; @ComparatorClass("level2JaroWinklerTitle") -public class Level2JaroWinklerTitle extends AbstractComparator { +public class Level2JaroWinklerTitle extends AbstractStringComparator { public Level2JaroWinklerTitle(Map params) { super(params, new com.wcohen.ss.Level2JaroWinkler()); diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java index 5ac19ee2e..e66602e4f 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java @@ -5,11 +5,11 @@ import java.util.Map; import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; @ComparatorClass("level2Levenstein") -public class Level2Levenstein extends AbstractComparator { +public class Level2Levenstein extends AbstractStringComparator { public Level2Levenstein(Map params) { super(params, new com.wcohen.ss.Level2Levenstein()); diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java index 4072f52aa..0871f8176 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java @@ -5,11 +5,11 @@ import java.util.Map; import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; @ComparatorClass("levenstein") -public class Levenstein extends AbstractComparator { +public class Levenstein extends AbstractStringComparator { public Levenstein(Map params) { super(params, new com.wcohen.ss.Levenstein()); diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java index 896e93f09..877cb95ab 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java @@ -9,11 +9,11 @@ import org.apache.commons.logging.LogFactory; import com.wcohen.ss.AbstractStringDistance; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; @ComparatorClass("levensteinTitle") -public class LevensteinTitle extends AbstractComparator { +public class LevensteinTitle extends AbstractStringComparator { private static final Log log = LogFactory.getLog(LevensteinTitle.class); diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java index 796edf49e..341c0a62b 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java @@ -6,14 +6,14 @@ import java.util.Map; import com.wcohen.ss.AbstractStringDistance; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; /** * Compared compare between two titles, ignoring version numbers. Suitable for Software entities. */ @ComparatorClass("levensteinTitleIgnoreVersion") -public class LevensteinTitleIgnoreVersion extends AbstractComparator { +public class LevensteinTitleIgnoreVersion extends AbstractStringComparator { public LevensteinTitleIgnoreVersion(Map params) { super(params, new com.wcohen.ss.Levenstein()); diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ListContainsMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ListContainsMatch.java index 8abe37d96..059db8de5 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ListContainsMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ListContainsMatch.java @@ -3,15 +3,10 @@ package eu.dnetlib.pace.tree; import java.util.List; import java.util.Map; -import java.util.Set; import java.util.stream.Collectors; -import com.google.common.collect.Sets; - import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldList; -import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.AbstractListComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; /** @@ -20,7 +15,7 @@ import eu.dnetlib.pace.tree.support.ComparatorClass; * @author miconis * */ @ComparatorClass("listContainsMatch") -public class ListContainsMatch extends AbstractComparator { +public class ListContainsMatch extends AbstractListComparator { private Map params; private boolean CASE_SENSITIVE; @@ -38,11 +33,7 @@ public class ListContainsMatch extends AbstractComparator { } @Override - public double compare(final Field a, final Field b, final Config conf) { - - List sa = ((FieldList) a).stringList(); - List sb = ((FieldList) b).stringList(); - + public double compare(List sa, List sb, Config conf) { if (sa.isEmpty() || sb.isEmpty()) { return -1; } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java index ee4b58d9c..b9d62cf16 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java @@ -6,11 +6,11 @@ import java.util.Map; import com.wcohen.ss.AbstractStringDistance; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; @ComparatorClass("mustBeDifferent") -public class MustBeDifferent extends AbstractComparator { +public class MustBeDifferent extends AbstractStringComparator { public MustBeDifferent(Map params) { super(params, new com.wcohen.ss.Levenstein()); diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java index 8b400122f..3ae1dcde0 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java @@ -4,7 +4,6 @@ package eu.dnetlib.pace.tree; import java.util.Map; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.tree.support.Comparator; import eu.dnetlib.pace.tree.support.ComparatorClass; @@ -13,13 +12,13 @@ import eu.dnetlib.pace.tree.support.ComparatorClass; * NullDistanceAlgo. */ @ComparatorClass("null") -public class NullDistanceAlgo implements Comparator { +public class NullDistanceAlgo implements Comparator { public NullDistanceAlgo(Map params) { } @Override - public double compare(Field a, Field b, Config config) { + public double compare(Object a, Object b, Config config) { return 0; } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersComparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersComparator.java index ebe25bab4..2c003a170 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersComparator.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersComparator.java @@ -4,11 +4,11 @@ package eu.dnetlib.pace.tree; import java.util.Map; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; @ComparatorClass("numbersComparator") -public class NumbersComparator extends AbstractComparator { +public class NumbersComparator extends AbstractStringComparator { Map params; diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersMatch.java index 52f99d018..820436d2e 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersMatch.java @@ -4,11 +4,11 @@ package eu.dnetlib.pace.tree; import java.util.Map; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; @ComparatorClass("numbersMatch") -public class NumbersMatch extends AbstractComparator { +public class NumbersMatch extends AbstractStringComparator { public NumbersMatch(Map params) { super(params); diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/RomansMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/RomansMatch.java index 08e4d5d84..a7c580973 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/RomansMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/RomansMatch.java @@ -4,11 +4,11 @@ package eu.dnetlib.pace.tree; import java.util.Map; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; @ComparatorClass("romansMatch") -public class RomansMatch extends AbstractComparator { +public class RomansMatch extends AbstractStringComparator { public RomansMatch(Map params) { super(params); diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java index 01cb3dd63..fb99ddb14 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java @@ -4,11 +4,8 @@ package eu.dnetlib.pace.tree; import java.util.List; import java.util.Map; -import com.google.common.collect.Iterables; - import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.AbstractListComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; /** @@ -17,7 +14,7 @@ import eu.dnetlib.pace.tree.support.ComparatorClass; * @author claudio */ @ComparatorClass("sizeMatch") -public class SizeMatch extends AbstractComparator { +public class SizeMatch extends AbstractListComparator { /** * Instantiates a new size match. @@ -30,23 +27,12 @@ public class SizeMatch extends AbstractComparator { } @Override - public double compare(final Field a, final Field b, final Config conf) { + public double compare(final List a, final List b, final Config conf) { if (a.isEmpty() || b.isEmpty()) - return -1; + return -1.0; - return Iterables.size(a) == Iterables.size(b) ? 1 : 0; - } - - /** - * Checks if is empty. - * - * @param a - * the a - * @return true, if is empty - */ - protected boolean isEmpty(final Iterable a) { - return (a == null) || Iterables.isEmpty(a); + return a.size() == b.size() ? 1.0 : 0.0; } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringContainsMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringContainsMatch.java index cef6de504..bca417b60 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringContainsMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringContainsMatch.java @@ -4,7 +4,7 @@ package eu.dnetlib.pace.tree; import java.util.Map; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; /** @@ -13,7 +13,7 @@ import eu.dnetlib.pace.tree.support.ComparatorClass; * @author miconis * */ @ComparatorClass("stringContainsMatch") -public class StringContainsMatch extends AbstractComparator { +public class StringContainsMatch extends AbstractStringComparator { private Map params; diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java index c74deadc9..b4dbef3bb 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java @@ -2,6 +2,7 @@ package eu.dnetlib.pace.tree; import java.util.HashSet; +import java.util.List; import java.util.Map; import java.util.Set; @@ -11,13 +12,11 @@ import org.apache.commons.logging.LogFactory; import com.google.common.collect.Sets; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldList; -import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.AbstractListComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; @ComparatorClass("stringListMatch") -public class StringListMatch extends AbstractComparator { +public class StringListMatch extends AbstractListComparator { private static final Log log = LogFactory.getLog(StringListMatch.class); private Map params; @@ -32,10 +31,10 @@ public class StringListMatch extends AbstractComparator { } @Override - public double compare(final Field a, final Field b, final Config conf) { + public double compare(final List a, final List b, final Config conf) { - final Set pa = new HashSet<>(((FieldList) a).stringList()); - final Set pb = new HashSet<>(((FieldList) b).stringList()); + final Set pa = new HashSet<>(a); + final Set pb = new HashSet<>(b); if (pa.isEmpty() || pb.isEmpty()) { return -1; // return undefined if one of the two lists is empty diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java index 23be3f752..3f8c40599 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java @@ -8,25 +8,24 @@ import org.apache.commons.lang3.StringUtils; import com.wcohen.ss.AbstractStringDistance; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.config.Type; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; /** * The Class SubStringLevenstein. */ @ComparatorClass("subStringLevenstein") -public class SubStringLevenstein extends AbstractComparator { +public class SubStringLevenstein extends AbstractStringComparator { - /** The limit. */ + /** + * The limit. + */ protected int limit; /** * Instantiates a new sub string levenstein. - * - * @param w - * the w + * + * @param w the w */ public SubStringLevenstein(final double w) { super(w, new com.wcohen.ss.Levenstein()); @@ -39,11 +38,9 @@ public class SubStringLevenstein extends AbstractComparator { /** * Instantiates a new sub string levenstein. - * - * @param w - * the w - * @param limit - * the limit + * + * @param w the w + * @param limit the limit */ public SubStringLevenstein(final double w, final int limit) { super(w, new com.wcohen.ss.Levenstein()); @@ -52,13 +49,10 @@ public class SubStringLevenstein extends AbstractComparator { /** * Instantiates a new sub string levenstein. - * - * @param w - * the w - * @param limit - * the limit - * @param ssalgo - * the ssalgo + * + * @param w the w + * @param limit the limit + * @param ssalgo the ssalgo */ protected SubStringLevenstein(final double w, final int limit, final AbstractStringDistance ssalgo) { super(w, ssalgo); @@ -71,11 +65,8 @@ public class SubStringLevenstein extends AbstractComparator { * eu.dnetlib.pace.model.Field) */ @Override - public double distance(final Field a, final Field b, final Config conf) { - if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) - return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit), conf); - - throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString()); + public double distance(final String a, final String b, final Config conf) { + return distance(StringUtils.left(a, limit), StringUtils.left(b, limit), conf); } /* diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java index db1faf9e2..8d99ac27f 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java @@ -1,12 +1,10 @@ package eu.dnetlib.pace.tree; -import java.util.List; import java.util.Map; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; /** @@ -16,17 +14,14 @@ import eu.dnetlib.pace.tree.support.ComparatorClass; * */ @ComparatorClass("titleVersionMatch") -public class TitleVersionMatch extends AbstractComparator { +public class TitleVersionMatch extends AbstractStringComparator { public TitleVersionMatch(final Map params) { super(params); } @Override - public double compare(final Field a, final Field b, final Config conf) { - final String valueA = getFirstValue(a); - final String valueB = getFirstValue(b); - + public double compare(final String valueA, final String valueB, final Config conf) { if (valueA.isEmpty() || valueB.isEmpty()) return -1; @@ -38,4 +33,7 @@ public class TitleVersionMatch extends AbstractComparator { return getClass().getSimpleName() + ":" + super.toString(); } + protected String toString(final Object object) { + return toFirstString(object); + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java index f4f00a908..722236be6 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java @@ -8,7 +8,6 @@ import java.util.Map; import org.apache.commons.lang3.StringUtils; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.tree.support.ComparatorClass; @ComparatorClass("urlMatcher") @@ -31,9 +30,9 @@ public class UrlMatcher extends Levenstein { } @Override - public double distance(Field a, Field b, final Config conf) { - final URL urlA = asUrl(getFirstValue(a)); - final URL urlB = asUrl(getFirstValue(b)); + public double distance(String a, String b, final Config conf) { + final URL urlA = asUrl(a); + final URL urlB = asUrl(b); if (!urlA.getHost().equalsIgnoreCase(urlB.getHost())) { return 0.0; @@ -58,4 +57,7 @@ public class UrlMatcher extends Levenstein { } } + protected String toString(final Object object) { + return toFirstString(object); + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java index 7ee8c8bad..95f796f6a 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java @@ -6,8 +6,7 @@ import java.util.Map; import org.apache.commons.lang3.StringUtils; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; /** @@ -16,7 +15,7 @@ import eu.dnetlib.pace.tree.support.ComparatorClass; * @author claudio */ @ComparatorClass("yearMatch") -public class YearMatch extends AbstractComparator { +public class YearMatch extends AbstractStringComparator { private int limit = 4; @@ -25,7 +24,7 @@ public class YearMatch extends AbstractComparator { } @Override - public double compare(final Field a, final Field b, final Config conf) { + public double compare(final String a, final String b, final Config conf) { final String valueA = getNumbers(getFirstValue(a)); final String valueB = getNumbers(getFirstValue(b)); @@ -42,8 +41,8 @@ public class YearMatch extends AbstractComparator { return s.length() == limit; } - protected String getFirstValue(final Field value) { - return (value != null) && !value.isEmpty() ? StringUtils.left(value.stringValue(), limit) : ""; + protected String getFirstValue(final String value) { + return (value != null) && !value.isEmpty() ? StringUtils.left(value, limit) : ""; } @Override diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java index 3ecffb289..8a957c5e3 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java @@ -4,15 +4,14 @@ package eu.dnetlib.pace.tree.support; import java.util.List; import java.util.Map; +import com.google.common.base.Joiner; +import com.google.common.collect.Lists; import com.wcohen.ss.AbstractStringDistance; import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.config.Type; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldList; -public abstract class AbstractComparator extends AbstractPaceFunctions implements Comparator { +public abstract class AbstractComparator extends AbstractPaceFunctions implements Comparator { /** The ssalgo. */ protected AbstractStringDistance ssalgo; @@ -69,8 +68,8 @@ public abstract class AbstractComparator extends AbstractPaceFunctions implement * the b * @return the double */ - public double distance(final String a, final String b, final Config conf) { + protected double distance(final String a, final String b, final Config conf) { if (a.isEmpty() || b.isEmpty()) { return -1; // return -1 if a field is missing } @@ -78,49 +77,50 @@ public abstract class AbstractComparator extends AbstractPaceFunctions implement return normalize(score); } - /** - * Distance. - * - * @param a - * the a - * @param b - * the b - * @return the double - */ - protected double distance(final List a, final List b, final Config conf) { - return distance(concat(a), concat(b), conf); - } - - public double distance(final Field a, final Field b, final Config conf) { - if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) - return distance(a.stringValue(), b.stringValue(), conf); - if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) - return distance(toList(a), toList(b), conf); - - throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString()); - } - - @Override - public double compare(final Field a, final Field b, final Config conf) { + protected double compare(final String a, final String b, final Config conf) { if (a.isEmpty() || b.isEmpty()) return -1; - if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) - return distance(a.stringValue(), b.stringValue(), conf); - if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) - return distance(toList(a), toList(b), conf); - - throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString()); + return distance(a, b, conf); } /** - * To list. + * Convert the given argument to a List of Strings * - * @param list - * the list + * @param object + * function argument * @return the list */ - protected List toList(final Field list) { - return ((FieldList) list).stringList(); + protected List toList(final Object object) { + if (object instanceof List) { + return (List) object; + } + + return Lists.newArrayList(object.toString()); + } + + /** + * Convert the given argument to a String + * + * @param object + * function argument + * @return the list + */ + protected String toString(final Object object) { + if (object instanceof List) { + List l = (List) object; + return Joiner.on(" ").join(l); + } + + return object.toString(); + } + + protected String toFirstString(final Object object) { + if (object instanceof List) { + List l = (List) object; + return l.isEmpty() ? "" : l.get(0); + } + + return object.toString(); } public double getWeight() { diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractListComparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractListComparator.java new file mode 100644 index 000000000..3f35350bd --- /dev/null +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractListComparator.java @@ -0,0 +1,39 @@ + +package eu.dnetlib.pace.tree.support; + +import java.util.List; +import java.util.Map; + +import com.wcohen.ss.AbstractStringDistance; + +import eu.dnetlib.pace.config.Config; + +abstract public class AbstractListComparator extends AbstractComparator> { + protected AbstractListComparator(Map params) { + super(params); + } + + protected AbstractListComparator(Map params, AbstractStringDistance ssalgo) { + super(params, ssalgo); + } + + protected AbstractListComparator(double weight, AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + + protected AbstractListComparator(AbstractStringDistance ssalgo) { + super(ssalgo); + } + + @Override + public double compare(Object a, Object b, Config conf) { + return compare(toList(a), toList(b), conf); + } + + public double compare(final List a, final List b, final Config conf) { + if (a.isEmpty() || b.isEmpty()) + return -1; + + return distance(concat(a), concat(b), conf); + } +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractSortedComparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractSortedComparator.java index 8927f2e14..06c806b92 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractSortedComparator.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractSortedComparator.java @@ -8,10 +8,7 @@ import java.util.Map; import com.google.common.collect.Lists; import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldList; - -public abstract class AbstractSortedComparator extends AbstractComparator { +public abstract class AbstractSortedComparator extends AbstractListComparator { /** * Instantiates a new sorted second string compare algo. @@ -30,11 +27,14 @@ public abstract class AbstractSortedComparator extends AbstractComparator { } @Override - protected List toList(final Field list) { - FieldList fl = (FieldList) list; - List values = Lists.newArrayList(fl.stringList()); - Collections.sort(values); - return values; - } + protected List toList(final Object object) { + if (object instanceof List) { + List fl = (List) object; + List values = Lists.newArrayList(fl); + Collections.sort(values); + return values; + } + return Lists.newArrayList(object.toString()); + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractStringComparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractStringComparator.java new file mode 100644 index 000000000..037ff6634 --- /dev/null +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractStringComparator.java @@ -0,0 +1,46 @@ + +package eu.dnetlib.pace.tree.support; + +import java.util.Map; + +import com.wcohen.ss.AbstractStringDistance; + +import eu.dnetlib.pace.config.Config; + +public abstract class AbstractStringComparator extends AbstractComparator { + protected AbstractStringComparator(Map params) { + super(params); + } + + protected AbstractStringComparator(Map params, AbstractStringDistance ssalgo) { + super(params, ssalgo); + } + + protected AbstractStringComparator(double weight, AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + + protected AbstractStringComparator(AbstractStringDistance ssalgo) { + super(ssalgo); + } + + public double distance(final String a, final String b, final Config conf) { + if (a.isEmpty() || b.isEmpty()) { + return -1; // return -1 if a field is missing + } + double score = ssalgo.score(a, b); + return normalize(score); + } + + @Override + public double compare(Object a, Object b, Config conf) { + return compare(toString(a), toString(b), conf); + } + + public double compare(final String a, final String b, final Config conf) { + if (a.isEmpty() || b.isEmpty()) + return -1; + return distance(a, b, conf); + } + +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java index b11ca5429..15a39921b 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java @@ -2,13 +2,11 @@ package eu.dnetlib.pace.tree.support; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; -public interface Comparator { +public interface Comparator { /* * return : -1 -> can't decide (i.e. missing field) >0 -> similarity degree (depends on the algorithm) */ - public double compare(Field a, Field b, Config conf); - + public double compare(Object a, Object b, Config conf); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java index 0d5c80f53..46e66378e 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java @@ -6,7 +6,6 @@ import java.io.Serializable; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.util.PaceException; /** @@ -17,12 +16,12 @@ public class FieldStats implements Serializable { private double weight; // weight for the field (to be used in the aggregation) private double threshold; // threshold for the field (to be used in some kind of aggregations) private double result; // the result of the comparison - private Field a; - private Field b; + private Object a; + private Object b; private boolean countIfUndefined; - public FieldStats(double weight, double threshold, double result, boolean countIfUndefined, Field a, Field b) { + public FieldStats(double weight, double threshold, double result, boolean countIfUndefined, Object a, Object b) { this.weight = weight; this.threshold = threshold; this.result = result; @@ -63,19 +62,19 @@ public class FieldStats implements Serializable { this.countIfUndefined = countIfUndefined; } - public Field getA() { + public Object getA() { return a; } - public void setA(Field a) { + public void setA(Object a) { this.a = a; } - public Field getB() { + public Object getB() { return b; } - public void setB(Field b) { + public void setB(Object b) { this.b = b; } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java index 8dff818e8..60559412d 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java @@ -7,10 +7,19 @@ public enum MatchType { public static MatchType parse(String value) { - try { - return MatchType.valueOf(value); - } catch (IllegalArgumentException e) { - return MatchType.UNDEFINED; // return UNDEFINED if the enum is not parsable + if (MATCH.name().equals(value)) { + return MATCH; + } else if (NO_MATCH.name().equals(value)) { + return NO_MATCH; + } else { + return UNDEFINED; } + +// try { +// return MatchType.valueOf(value); +// } +// catch (IllegalArgumentException e) { +// return MatchType.UNDEFINED; //return UNDEFINED if the enum is not parsable +// } } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java index a754f13cd..0973fdf1e 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java @@ -3,17 +3,17 @@ package eu.dnetlib.pace.tree.support; import java.io.IOException; import java.io.Serializable; -import java.io.StringWriter; import java.util.List; -import org.apache.commons.lang3.StringUtils; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.types.ArrayType; +import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.StringType; -import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.PaceConfig; -import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.PaceException; public class TreeNodeDef implements Serializable { @@ -46,31 +46,27 @@ public class TreeNodeDef implements Serializable { } // function for the evaluation of the node - public TreeNodeStats evaluate(MapDocument doc1, MapDocument doc2, Config conf) { + public TreeNodeStats evaluate(Row doc1, Row doc2, Config conf) { TreeNodeStats stats = new TreeNodeStats(); // for each field in the node, it computes the for (FieldConf fieldConf : fields) { - double weight = fieldConf.getWeight(); - double result; + Object value1 = getJavaValue(doc1, fieldConf.getField()); + Object value2 = getJavaValue(doc2, fieldConf.getField()); + // if the param specifies a cross comparison (i.e. compare elements from different fields), compute the // result for both sides and return the maximum - if (fieldConf.getParams().keySet().stream().anyMatch(k -> k.contains(CROSS_COMPARE))) { - String crossField = fieldConf.getParams().get(CROSS_COMPARE); - double result1 = comparator(fieldConf) - .compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(crossField), conf); - double result2 = comparator(fieldConf) - .compare(doc1.getFieldMap().get(crossField), doc2.getFieldMap().get(fieldConf.getField()), conf); + String crossField = fieldConf.getParams().get(CROSS_COMPARE); + if (crossField != null) { + double result1 = comparator(fieldConf).compare(value1, getJavaValue(doc2, crossField), conf); + double result2 = comparator(fieldConf).compare(getJavaValue(doc1, crossField), value2, conf); result = Math.max(result1, result2); } else { - result = comparator(fieldConf) - .compare( - doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), - conf); + result = comparator(fieldConf).compare(value1, value2, conf); } stats @@ -81,13 +77,27 @@ public class TreeNodeDef implements Serializable { Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "1.0")), result, fieldConf.isCountIfUndefined(), - doc1.getFieldMap().get(fieldConf.getField()), - doc2.getFieldMap().get(fieldConf.getField()))); + value1, + value2)); } return stats; } + public Object getJavaValue(Row row, String name) { + int pos = row.fieldIndex(name); + if (pos >= 0) { + DataType dt = row.schema().fields()[pos].dataType(); + if (dt instanceof StringType) { + return row.getString(pos); + } else if (dt instanceof ArrayType) { + return row.getList(pos); + } + } + + return null; + } + private Comparator comparator(final FieldConf field) { return PaceConfig.resolver.getComparator(field.getComparator(), field.getParams()); diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java index 04e16be34..263504dbb 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java @@ -3,9 +3,9 @@ package eu.dnetlib.pace.tree.support; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.spark.sql.Row; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.PaceException; /** @@ -21,72 +21,72 @@ public class TreeProcessor { this.config = config; } - public boolean compare(final MapDocument a, final MapDocument b) { + // row based copies + + public boolean compare(final Row a, final Row b) { // evaluate the decision tree return evaluateTree(a, b).getResult() == MatchType.MATCH; } - public TreeStats evaluateTree(final MapDocument doc1, final MapDocument doc2) { + public TreeStats evaluateTree(final Row doc1, final Row doc2) { TreeStats treeStats = new TreeStats(); - String current = "start"; + String nextNodeName = "start"; - while (MatchType.parse(current) == MatchType.UNDEFINED) { + do { - TreeNodeDef currentNode = config.decisionTree().get(current); + TreeNodeDef currentNode = config.decisionTree().get(nextNodeName); // throw an exception if the node doesn't exist if (currentNode == null) - throw new PaceException("Missing tree node: " + current); + throw new PaceException("Missing tree node: " + nextNodeName); TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config); - treeStats.addNodeStats(current, stats); + treeStats.addNodeStats(nextNodeName, stats); // if ignoreUndefined=false the miss is considered as undefined if (!currentNode.isIgnoreUndefined() && stats.undefinedCount() > 0) { - current = currentNode.getUndefined(); + nextNodeName = currentNode.getUndefined(); } // if ignoreUndefined=true the miss is ignored and the score computed anyway else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) { - current = currentNode.getPositive(); + nextNodeName = currentNode.getPositive(); } else { - current = currentNode.getNegative(); + nextNodeName = currentNode.getNegative(); } - } + } while (MatchType.parse(nextNodeName) == MatchType.UNDEFINED); - treeStats.setResult(MatchType.parse(current)); + treeStats.setResult(MatchType.parse(nextNodeName)); return treeStats; } - public double computeScore(final MapDocument doc1, final MapDocument doc2) { - String current = "start"; + public double computeScore(final Row doc1, final Row doc2) { + String nextNodeName = "start"; double score = 0.0; - while (MatchType.parse(current) == MatchType.UNDEFINED) { + do { - TreeNodeDef currentNode = config.decisionTree().get(current); + TreeNodeDef currentNode = config.decisionTree().get(nextNodeName); // throw an exception if the node doesn't exist if (currentNode == null) - throw new PaceException("The Tree Node doesn't exist: " + current); + throw new PaceException("The Tree Node doesn't exist: " + nextNodeName); TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config); score = stats.getFinalScore(currentNode.getAggregation()); // if ignoreUndefined=false the miss is considered as undefined if (!currentNode.isIgnoreUndefined() && stats.undefinedCount() > 0) { - current = currentNode.getUndefined(); + nextNodeName = currentNode.getUndefined(); } // if ignoreUndefined=true the miss is ignored and the score computed anyway else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) { - current = currentNode.getPositive(); + nextNodeName = currentNode.getPositive(); } else { - current = currentNode.getNegative(); + nextNodeName = currentNode.getNegative(); } - - } + } while (MatchType.parse(nextNodeName) == MatchType.UNDEFINED); return score; } - } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java index 4053a123c..c2b0ddda7 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java @@ -1,20 +1,19 @@ package eu.dnetlib.pace.util; -import java.util.*; +import java.util.ArrayList; +import java.util.List; import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.types.ArrayType; +import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.StringType; -import com.google.common.collect.Lists; - -import eu.dnetlib.pace.clustering.NGramUtils; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.config.WfConfig; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.model.MapDocumentComparator; import eu.dnetlib.pace.tree.support.TreeProcessor; public class BlockProcessor { @@ -25,6 +24,9 @@ public class BlockProcessor { private DedupConfig dedupConf; + private final int identifierFieldPos; + private final int orderFieldPos; + public static void constructAccumulator(final DedupConfig dedupConf) { accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1")); accumulators @@ -47,152 +49,80 @@ public class BlockProcessor { .add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold())); } - public BlockProcessor(DedupConfig dedupConf) { + public BlockProcessor(DedupConfig dedupConf, int identifierFieldPos, int orderFieldPos) { this.dedupConf = dedupConf; + this.identifierFieldPos = identifierFieldPos; + this.orderFieldPos = orderFieldPos; } - public void processSortedBlock(final String key, final List documents, final Reporter context) { + public void processSortedRows(final List documents, final Reporter context) { if (documents.size() > 1) { // log.info("reducing key: '" + key + "' records: " + q.size()); - process(prepare(documents), context); + processRows(documents, context); } else { context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1); } } - public void process(final String key, final Iterable documents, final Reporter context) { + private void processRows(final List queue, final Reporter context) { - final Queue q = prepare(documents); + for (int pivotPos = 0; pivotPos < queue.size(); pivotPos++) { + final Row pivot = queue.get(pivotPos); - if (q.size() > 1) { -// log.info("reducing key: '" + key + "' records: " + q.size()); - process(simplifyQueue(q, key, context), context); - - } else { - context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1); - } - } - - private Queue prepare(final Iterable documents) { - final Queue queue = new PriorityQueue<>(100, - new MapDocumentComparator(dedupConf.getWf().getOrderField())); - - final Set seen = new HashSet(); - final int queueMaxSize = dedupConf.getWf().getQueueMaxSize(); - - documents.forEach(doc -> { - if (queue.size() <= queueMaxSize) { - final String id = doc.getIdentifier(); - - if (!seen.contains(id)) { - seen.add(id); - queue.add(doc); - } - } - }); - - return queue; - } - - private Queue simplifyQueue(final Queue queue, final String ngram, - final Reporter context) { - final Queue q = new LinkedList<>(); - - String fieldRef = ""; - final List tempResults = Lists.newArrayList(); - - while (!queue.isEmpty()) { - final MapDocument result = queue.remove(); - - final String orderFieldName = dedupConf.getWf().getOrderField(); - final Field orderFieldValue = result.values(orderFieldName); - if (!orderFieldValue.isEmpty()) { - final String field = NGramUtils.cleanupForOrdering(orderFieldValue.stringValue()); - if (field.equals(fieldRef)) { - tempResults.add(result); - } else { - populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram); - tempResults.clear(); - tempResults.add(result); - fieldRef = field; - } - } else { - context - .incrementCounter( - dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField(), 1); - } - } - populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram); - - return q; - } - - private void populateSimplifiedQueue(final Queue q, - final List tempResults, - final Reporter context, - final String fieldRef, - final String ngram) { - WfConfig wf = dedupConf.getWf(); - if (tempResults.size() < wf.getGroupMaxSize()) { - q.addAll(tempResults); - } else { - context - .incrementCounter( - wf.getEntityType(), - String.format("Skipped records for count(%s) >= %s", wf.getOrderField(), wf.getGroupMaxSize()), - tempResults.size()); -// log.info("Skipped field: " + fieldRef + " - size: " + tempResults.size() + " - ngram: " + ngram); - } - } - - private void process(final Queue queue, final Reporter context) { - - while (!queue.isEmpty()) { - - final MapDocument pivot = queue.remove(); - final String idPivot = pivot.getIdentifier(); - - WfConfig wf = dedupConf.getWf(); - final Field fieldsPivot = pivot.values(wf.getOrderField()); - final String fieldPivot = (fieldsPivot == null) || fieldsPivot.isEmpty() ? "" : fieldsPivot.stringValue(); + final String idPivot = pivot.getString(identifierFieldPos); // identifier + final Object fieldsPivot = getJavaValue(pivot, orderFieldPos); + final String fieldPivot = (fieldsPivot == null) ? "" : fieldsPivot.toString(); + final WfConfig wf = dedupConf.getWf(); if (fieldPivot != null) { int i = 0; - for (final MapDocument curr : queue) { - final String idCurr = curr.getIdentifier(); + for (int windowPos = pivotPos + 1; windowPos < queue.size(); windowPos++) { + final Row curr = queue.get(windowPos); + final String idCurr = curr.getString(identifierFieldPos); // identifier if (mustSkip(idCurr)) { - context.incrementCounter(wf.getEntityType(), "skip list", 1); - break; } - if (i > wf.getSlidingWindowSize()) { + if (++i > wf.getSlidingWindowSize()) { break; } - final Field fieldsCurr = curr.values(wf.getOrderField()); - final String fieldCurr = (fieldsCurr == null) || fieldsCurr.isEmpty() ? null - : fieldsCurr.stringValue(); + final Object fieldsCurr = getJavaValue(curr, orderFieldPos); + final String fieldCurr = (fieldsCurr == null) ? null : fieldsCurr.toString(); if (!idCurr.equals(idPivot) && (fieldCurr != null)) { final TreeProcessor treeProcessor = new TreeProcessor(dedupConf); emitOutput(treeProcessor.compare(pivot, curr), idPivot, idCurr, context); - } } } } } + public Object getJavaValue(Row row, int pos) { + DataType dt = row.schema().fields()[pos].dataType(); + if (dt instanceof StringType) { + return row.getString(pos); + } else if (dt instanceof ArrayType) { + return row.getList(pos); + } + + return null; + } + private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) { if (result) { - writeSimilarity(context, idPivot, idCurr); + if (idPivot.compareTo(idCurr) <= 0) { + writeSimilarity(context, idPivot, idCurr); + } else { + writeSimilarity(context, idCurr, idPivot); + } context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1); } else { context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1); @@ -211,7 +141,6 @@ public class BlockProcessor { final String type = dedupConf.getWf().getEntityType(); context.emit(type, from, to); - context.emit(type, to, from); } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java deleted file mode 100644 index 40f502e11..000000000 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java +++ /dev/null @@ -1,276 +0,0 @@ - -package eu.dnetlib.pace.util; - -import java.util.*; - -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -import com.google.common.collect.Lists; - -import eu.dnetlib.pace.clustering.NGramUtils; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.config.WfConfig; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.model.MapDocumentComparator; -import eu.dnetlib.pace.tree.*; -import eu.dnetlib.pace.tree.support.TreeProcessor; - -public class BlockProcessorForTesting { - - public static final List accumulators = new ArrayList<>(); - - private static final Log log = LogFactory.getLog(eu.dnetlib.pace.util.BlockProcessorForTesting.class); - - private DedupConfig dedupConf; - - public static void constructAccumulator(final DedupConfig dedupConf) { - accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1")); - accumulators - .add( - String - .format( - "%s::%s", dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField())); - accumulators - .add( - String - .format( - "%s::%s", dedupConf.getWf().getEntityType(), - String - .format( - "Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(), - dedupConf.getWf().getGroupMaxSize()))); - accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list")); - accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)")); - accumulators - .add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold())); - } - - public BlockProcessorForTesting(DedupConfig dedupConf) { - this.dedupConf = dedupConf; - } - - public void processSortedBlock(final String key, final List documents, final Reporter context, - boolean useTree, boolean noMatch) { - if (documents.size() > 1) { -// log.info("reducing key: '" + key + "' records: " + q.size()); - process(prepare(documents), context, useTree, noMatch); - - } else { - context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1); - } - } - - public void process(final String key, final Iterable documents, final Reporter context, - boolean useTree, boolean noMatch) { - - final Queue q = prepare(documents); - - if (q.size() > 1) { -// log.info("reducing key: '" + key + "' records: " + q.size()); - process(simplifyQueue(q, key, context), context, useTree, noMatch); - - } else { - context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1); - } - } - - private Queue prepare(final Iterable documents) { - final Queue queue = new PriorityQueue<>(100, - new MapDocumentComparator(dedupConf.getWf().getOrderField())); - - final Set seen = new HashSet(); - final int queueMaxSize = dedupConf.getWf().getQueueMaxSize(); - - documents.forEach(doc -> { - if (queue.size() <= queueMaxSize) { - final String id = doc.getIdentifier(); - - if (!seen.contains(id)) { - seen.add(id); - queue.add(doc); - } - } - }); - - return queue; - } - - private Queue simplifyQueue(final Queue queue, final String ngram, - final Reporter context) { - final Queue q = new LinkedList<>(); - - String fieldRef = ""; - final List tempResults = Lists.newArrayList(); - - while (!queue.isEmpty()) { - final MapDocument result = queue.remove(); - - final String orderFieldName = dedupConf.getWf().getOrderField(); - final Field orderFieldValue = result.values(orderFieldName); - if (!orderFieldValue.isEmpty()) { - final String field = NGramUtils.cleanupForOrdering(orderFieldValue.stringValue()); - if (field.equals(fieldRef)) { - tempResults.add(result); - } else { - populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram); - tempResults.clear(); - tempResults.add(result); - fieldRef = field; - } - } else { - context - .incrementCounter( - dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField(), 1); - } - } - populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram); - - return q; - } - - private void populateSimplifiedQueue(final Queue q, - final List tempResults, - final Reporter context, - final String fieldRef, - final String ngram) { - WfConfig wf = dedupConf.getWf(); - if (tempResults.size() < wf.getGroupMaxSize()) { - q.addAll(tempResults); - } else { - context - .incrementCounter( - wf.getEntityType(), - String.format("Skipped records for count(%s) >= %s", wf.getOrderField(), wf.getGroupMaxSize()), - tempResults.size()); -// log.info("Skipped field: " + fieldRef + " - size: " + tempResults.size() + " - ngram: " + ngram); - } - } - - private void process(final Queue queue, final Reporter context, boolean useTree, boolean noMatch) { - - while (!queue.isEmpty()) { - - final MapDocument pivot = queue.remove(); - final String idPivot = pivot.getIdentifier(); - - WfConfig wf = dedupConf.getWf(); - final Field fieldsPivot = pivot.values(wf.getOrderField()); - final String fieldPivot = (fieldsPivot == null) || fieldsPivot.isEmpty() ? "" : fieldsPivot.stringValue(); - - if (fieldPivot != null) { - int i = 0; - for (final MapDocument curr : queue) { - final String idCurr = curr.getIdentifier(); - - if (mustSkip(idCurr)) { - - context.incrementCounter(wf.getEntityType(), "skip list", 1); - - break; - } - - if (i > wf.getSlidingWindowSize()) { - break; - } - - final Field fieldsCurr = curr.values(wf.getOrderField()); - final String fieldCurr = (fieldsCurr == null) || fieldsCurr.isEmpty() ? null - : fieldsCurr.stringValue(); - - if (!idCurr.equals(idPivot) && (fieldCurr != null)) { - - // draws no match relations (test purpose) - if (noMatch) { - emitOutput(!new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context); - } else { - // use the decision tree implementation or the "normal" implementation of the similarity - // score (valid only for publications) - if (useTree) - emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context); - else - emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context); - } -// if(new TreeProcessor(dedupConf).compare(pivot, curr) != publicationCompare(pivot, curr, dedupConf)) { -// emitOutput(true, idPivot, idCurr, context); -// } - - } - } - } - } - } - - protected static boolean compareInstanceType(MapDocument a, MapDocument b, DedupConfig conf) { - Map params = new HashMap<>(); - InstanceTypeMatch instanceTypeMatch = new InstanceTypeMatch(params); - double compare = instanceTypeMatch - .compare(a.getFieldMap().get("instance"), b.getFieldMap().get("instance"), conf); - return compare >= 1.0; - } - - private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) { - // if the score gives 1, the publications are equivalent - Map params = new HashMap<>(); - params.put("jpath_value", "$.value"); - params.put("jpath_classid", "$.qualifier.classid"); - params.put("mode", "count"); - - double score = 0.0; - - // levenstein title - LevensteinTitle levensteinTitle = new LevensteinTitle(params); - if (levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config) >= 0.9) { - score += 0.2; - } - - // pid - JsonListMatch jsonListMatch = new JsonListMatch(params); - if (jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config) >= 1.0) { - score += 0.5; - } - - // title version - TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params); - double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config); - if (result1 < 0 || result1 >= 1.0) { - score += 0.1; - } - - // authors match - params.remove("mode"); - AuthorsMatch authorsMatch = new AuthorsMatch(params); - double result2 = authorsMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config); - if (result2 < 0 || result2 >= 0.6) { - score += 0.2; - } - - return score >= 0.5; - } - - private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) { - - if (result) { - writeSimilarity(context, idPivot, idCurr); - context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1); - } else { - context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1); - } - } - - private boolean mustSkip(final String idPivot) { - return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot)); - } - - private String getNsPrefix(final String id) { - return StringUtils.substringBetween(id, "|", "::"); - } - - private void writeSimilarity(final Reporter context, final String from, final String to) { - final String type = dedupConf.getWf().getEntityType(); - - context.emit(type, from, to); - } -} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java index 12c96500e..c885f2aeb 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java @@ -19,23 +19,6 @@ package eu.dnetlib.pace.util; * limitations under the License. */ -/* - * Diff Match and Patch - * Copyright 2018 The diff-match-patch Authors. - * https://github.com/google/diff-match-patch - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ import java.io.UnsupportedEncodingException; import java.net.URLDecoder; import java.net.URLEncoder; diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java index 2c1a1700b..a59b6248b 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java @@ -2,19 +2,20 @@ package eu.dnetlib.pace.util; import java.math.BigDecimal; -import java.util.*; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.concurrent.ConcurrentHashMap; import java.util.function.Predicate; -import java.util.stream.Collectors; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; -import com.jayway.jsonpath.Configuration; +import com.jayway.jsonpath.DocumentContext; import com.jayway.jsonpath.JsonPath; -import com.jayway.jsonpath.Option; +import com.jayway.jsonpath.spi.cache.Cache; +import com.jayway.jsonpath.spi.cache.CacheProvider; -import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.config.Type; -import eu.dnetlib.pace.model.*; import net.minidev.json.JSONArray; public class MapDocumentUtil { @@ -22,103 +23,20 @@ public class MapDocumentUtil { public static final String URL_REGEX = "^(http|https|ftp)\\://.*"; public static Predicate urlFilter = s -> s.trim().matches(URL_REGEX); - public static MapDocument asMapDocumentWithJPath(DedupConfig conf, final String json) { - MapDocument m = new MapDocument(); - m.setIdentifier(getJPathString(conf.getWf().getIdPath(), json)); - Map stringField = new HashMap<>(); - conf.getPace().getModel().forEach(fdef -> { - switch (fdef.getType()) { - case String: - case Int: - stringField - .put( - fdef.getName(), new FieldValueImpl(fdef.getType(), fdef.getName(), - truncateValue(getJPathString(fdef.getPath(), json), fdef.getLength()))); - break; - case URL: - String uv = getJPathString(fdef.getPath(), json); - if (!urlFilter.test(uv)) - uv = ""; - stringField.put(fdef.getName(), new FieldValueImpl(fdef.getType(), fdef.getName(), uv)); - break; - case List: - case JSON: - FieldListImpl fi = new FieldListImpl(fdef.getName(), fdef.getType()); - truncateList(getJPathList(fdef.getPath(), json, fdef.getType()), fdef.getSize()) - .stream() - .map(item -> new FieldValueImpl(Type.String, fdef.getName(), item)) - .forEach(fi::add); - stringField.put(fdef.getName(), fi); - break; - case DoubleArray: - stringField - .put( - fdef.getName(), - new FieldValueImpl(Type.DoubleArray, - fdef.getName(), - getJPathArray(fdef.getPath(), json))); - break; - case StringConcat: - String[] jpaths = fdef.getPath().split("\\|\\|\\|"); - stringField - .put( - fdef.getName(), - new FieldValueImpl(Type.String, - fdef.getName(), - truncateValue( - Arrays - .stream(jpaths) - .map(jpath -> getJPathString(jpath, json)) - .collect(Collectors.joining(" ")), - fdef.getLength()))); - break; + static { + CacheProvider.setCache(new Cache() { + private final ConcurrentHashMap jsonPathCache = new ConcurrentHashMap(); + + @Override + public JsonPath get(String key) { + return jsonPathCache.get(key); + } + + @Override + public void put(String key, JsonPath value) { + jsonPathCache.put(key, value); } }); - m.setFieldMap(stringField); - return m; - } - - public static List getJPathList(String path, String json, Type type) { - if (type == Type.List) - return JsonPath - .using( - Configuration - .defaultConfiguration() - .addOptions(Option.ALWAYS_RETURN_LIST, Option.SUPPRESS_EXCEPTIONS)) - .parse(json) - .read(path); - Object jresult; - List result = new ArrayList<>(); - try { - jresult = JsonPath.read(json, path); - } catch (Throwable e) { - return result; - } - if (jresult instanceof JSONArray) { - - ((JSONArray) jresult).forEach(it -> { - - try { - result.add(new ObjectMapper().writeValueAsString(it)); - } catch (JsonProcessingException e) { - - } - }); - return result; - } - - if (jresult instanceof LinkedHashMap) { - try { - result.add(new ObjectMapper().writeValueAsString(jresult)); - } catch (JsonProcessingException e) { - - } - return result; - } - if (jresult instanceof String) { - result.add((String) jresult); - } - return result; } public static String getJPathString(final String jsonPath, final String json) { @@ -174,4 +92,54 @@ public class MapDocumentUtil { return list.subList(0, size); } + public static String getJPathString(final String jsonPath, final DocumentContext json) { + try { + Object o = json.read(jsonPath); + if (o instanceof String) + return (String) o; + if (o instanceof JSONArray && ((JSONArray) o).size() > 0) + return (String) ((JSONArray) o).get(0); + return ""; + } catch (Exception e) { + return ""; + } + } + + public static List getJPathList(String path, DocumentContext json, Type type) { + // if (type == Type.List) + // return JsonPath.using(Configuration.defaultConfiguration().addOptions(Option.ALWAYS_RETURN_LIST, + // Option.SUPPRESS_EXCEPTIONS)).parse(json).read(path); + Object jresult; + List result = new ArrayList<>(); + try { + jresult = json.read(path); + } catch (Throwable e) { + return result; + } + + if (jresult instanceof JSONArray) { + ((JSONArray) jresult).forEach(it -> { + try { + result.add(new ObjectMapper().writeValueAsString(it)); + } catch (JsonProcessingException e) { + + } + }); + return result; + } + + if (jresult instanceof LinkedHashMap) { + try { + result.add(new ObjectMapper().writeValueAsString(jresult)); + } catch (JsonProcessingException e) { + + } + return result; + } + if (jresult instanceof String) { + result.add((String) jresult); + } + return result; + } + } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/SparkReporter.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/SparkReporter.java new file mode 100644 index 000000000..437fe783b --- /dev/null +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/SparkReporter.java @@ -0,0 +1,85 @@ + +package eu.dnetlib.pace.util; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.spark.SparkContext; +import org.apache.spark.util.LongAccumulator; + +import eu.dnetlib.pace.config.DedupConfig; +import scala.Serializable; +import scala.Tuple2; + +public class SparkReporter implements Serializable, Reporter { + + private final List> relations = new ArrayList<>(); + + private final Map accumulators; + + public SparkReporter(Map accumulators) { + this.accumulators = accumulators; + } + + public void incrementCounter( + String counterGroup, + String counterName, + long delta, + Map accumulators) { + + final String accumulatorName = String.format("%s::%s", counterGroup, counterName); + if (accumulators.containsKey(accumulatorName)) { + accumulators.get(accumulatorName).add(delta); + } + } + + @Override + public void incrementCounter(String counterGroup, String counterName, long delta) { + + incrementCounter(counterGroup, counterName, delta, accumulators); + } + + @Override + public void emit(String type, String from, String to) { + relations.add(new Tuple2<>(from, to)); + } + + public List> getRelations() { + return relations; + } + + public static Map constructAccumulator( + final DedupConfig dedupConf, final SparkContext context) { + + Map accumulators = new HashMap<>(); + + String acc1 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1"); + accumulators.put(acc1, context.longAccumulator(acc1)); + String acc2 = String + .format( + "%s::%s", + dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()); + accumulators.put(acc2, context.longAccumulator(acc2)); + String acc3 = String + .format( + "%s::%s", + dedupConf.getWf().getEntityType(), + String + .format( + "Skipped records for count(%s) >= %s", + dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize())); + accumulators.put(acc3, context.longAccumulator(acc3)); + String acc4 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list"); + accumulators.put(acc4, context.longAccumulator(acc4)); + String acc5 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"); + accumulators.put(acc5, context.longAccumulator(acc5)); + String acc6 = String + .format( + "%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()); + accumulators.put(acc6, context.longAccumulator(acc6)); + + return accumulators; + } +} diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java index 2a37701aa..d3f502f35 100644 --- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java +++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java @@ -3,57 +3,42 @@ package eu.dnetlib.pace; import java.io.IOException; import java.io.StringWriter; -import java.nio.charset.StandardCharsets; import java.util.List; -import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; import eu.dnetlib.pace.common.AbstractPaceFunctions; -import eu.dnetlib.pace.config.Type; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldListImpl; -import eu.dnetlib.pace.model.FieldValueImpl; public abstract class AbstractPaceTest extends AbstractPaceFunctions { protected String readFromClasspath(final String filename) { final StringWriter sw = new StringWriter(); try { - IOUtils.copy(getClass().getResourceAsStream(filename), sw, StandardCharsets.UTF_8); + IOUtils.copy(getClass().getResourceAsStream(filename), sw); return sw.toString(); } catch (final IOException e) { throw new RuntimeException("cannot load resource from classpath: " + filename); } } - protected Field title(final String s) { - return new FieldValueImpl(Type.String, "title", s); + protected String title(final String s) { + return s; } - protected Field person(final String s) { - return new FieldValueImpl(Type.JSON, "person", s); + protected String person(final String s) { + return s; } - protected Field url(final String s) { - return new FieldValueImpl(Type.URL, "url", s); + protected String url(final String s) { + return s; } - protected Field array(final double[] a) { - return new FieldValueImpl(Type.DoubleArray, "array", a); - } - - protected Field createFieldList(List strings, String fieldName) { - - List fieldValueStream = strings - .stream() - .map(s -> new FieldValueImpl(Type.String, fieldName, s)) - .collect(Collectors.toList()); - - FieldListImpl a = new FieldListImpl(); - a.addAll(fieldValueStream); - + protected double[] array(final double[] a) { return a; + } + + protected List createFieldList(List strings, String fieldName) { + return strings; } } diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java index 9873278b9..f9a1ea9e2 100644 --- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java +++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java @@ -2,14 +2,12 @@ package eu.dnetlib.pace.clustering; import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; -import org.junit.jupiter.api.*; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import com.google.common.collect.Lists; import com.google.common.collect.Maps; -import com.google.common.collect.Sets; import eu.dnetlib.pace.AbstractPaceTest; import eu.dnetlib.pace.common.AbstractPaceFunctions; @@ -37,7 +35,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s = "http://www.test.it/path/to/resource"; System.out.println(s); - System.out.println(urlClustering.apply(conf, Lists.newArrayList(url(s)))); + System.out.println(urlClustering.apply(conf, Lists.newArrayList(s))); } @Test @@ -51,7 +49,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s = "Search for the Standard Model Higgs Boson"; System.out.println(s); - System.out.println(ngram.apply(conf, Lists.newArrayList(title(s)))); + System.out.println(ngram.apply(conf, Lists.newArrayList(s))); } @Test @@ -63,7 +61,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s = "Search for the Standard Model Higgs Boson"; System.out.println(s); - System.out.println(np.apply(conf, Lists.newArrayList(title(s)))); + System.out.println(np.apply(conf, Lists.newArrayList(s))); } @Test @@ -75,15 +73,15 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s1 = "University of Pisa"; System.out.println(s1); - System.out.println(np.apply(conf, Lists.newArrayList(title(s1)))); + System.out.println(np.apply(conf, Lists.newArrayList(s1))); final String s2 = "Pisa University"; System.out.println(s2); - System.out.println(np.apply(conf, Lists.newArrayList(title(s2)))); + System.out.println(np.apply(conf, Lists.newArrayList(s2))); final String s3 = "Parco Tecnologico Agroalimentare Umbria"; System.out.println(s3); - System.out.println(np.apply(conf, Lists.newArrayList(title(s3)))); + System.out.println(np.apply(conf, Lists.newArrayList(s3))); } @@ -97,7 +95,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s = "Search for the Standard Model Higgs Boson"; System.out.println(s); - System.out.println(acro.apply(conf, Lists.newArrayList(title(s)))); + System.out.println(acro.apply(conf, Lists.newArrayList(s))); } @Test @@ -109,12 +107,12 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s = "Search for the Standard Model Higgs Boson"; System.out.println(s); - System.out.println(sp.apply(conf, Lists.newArrayList(title(s)))); + System.out.println(sp.apply(conf, Lists.newArrayList(s))); params.put("len", 3); params.put("max", 1); - System.out.println(sp.apply(conf, Lists.newArrayList(title("Framework for general-purpose deduplication")))); + System.out.println(sp.apply(conf, Lists.newArrayList("Framework for general-purpose deduplication"))); } @Test @@ -127,7 +125,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s = "Search for the Standard Model Higgs Boson"; System.out.println(s); - System.out.println(sp.apply(conf, Lists.newArrayList(title(s)))); + System.out.println(sp.apply(conf, Lists.newArrayList(s))); } @Test @@ -138,31 +136,31 @@ public class ClusteringFunctionTest extends AbstractPaceTest { String s = "Search for the Standard Model Higgs Boson"; System.out.println(s); - System.out.println(sp.apply(conf, Lists.newArrayList(title(s)))); + System.out.println(sp.apply(conf, Lists.newArrayList(s))); s = "A Physical Education Teacher Is Like...: Examining Turkish Students Perceptions of Physical Education Teachers Through Metaphor Analysis"; System.out.println(s); - System.out.println(sp.apply(conf, Lists.newArrayList(title(s)))); + System.out.println(sp.apply(conf, Lists.newArrayList(s))); s = "Structure of a Eukaryotic Nonribosomal Peptide Synthetase Adenylation Domain That Activates a Large Hydroxamate Amino Acid in Siderophore Biosynthesis"; System.out.println(s); - System.out.println(sp.apply(conf, Lists.newArrayList(title(s)))); + System.out.println(sp.apply(conf, Lists.newArrayList(s))); s = "Performance Evaluation"; System.out.println(s); - System.out.println(sp.apply(conf, Lists.newArrayList(title(s)))); + System.out.println(sp.apply(conf, Lists.newArrayList(s))); s = "JRC Open Power Plants Database (JRC-PPDB-OPEN)"; System.out.println(s); - System.out.println(sp.apply(conf, Lists.newArrayList(title(s)))); + System.out.println(sp.apply(conf, Lists.newArrayList(s))); s = "JRC Open Power Plants Database"; System.out.println(s); - System.out.println(sp.apply(conf, Lists.newArrayList(title(s)))); + System.out.println(sp.apply(conf, Lists.newArrayList(s))); s = "niivue/niivue: 0.21.1"; System.out.println(s); - System.out.println(sp.apply(conf, Lists.newArrayList(title(s)))); + System.out.println(sp.apply(conf, Lists.newArrayList(s))); } @@ -175,7 +173,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s = "Search for the Standard Model Higgs Boson"; System.out.println(s); - System.out.println(sp.apply(conf, Lists.newArrayList(title(s)))); + System.out.println(sp.apply(conf, Lists.newArrayList(s))); } @Test @@ -184,35 +182,35 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final ClusteringFunction cf = new KeywordsClustering(params); final String s = "Polytechnic University of Turin"; System.out.println(s); - System.out.println(cf.apply(conf, Lists.newArrayList(title(s)))); + System.out.println(cf.apply(conf, Lists.newArrayList(s))); final String s1 = "POLITECNICO DI TORINO"; System.out.println(s1); - System.out.println(cf.apply(conf, Lists.newArrayList(title(s1)))); + System.out.println(cf.apply(conf, Lists.newArrayList(s1))); final String s2 = "Universita farmaceutica culturale di milano bergamo"; System.out.println("s2 = " + s2); - System.out.println(cf.apply(conf, Lists.newArrayList(title(s2)))); + System.out.println(cf.apply(conf, Lists.newArrayList(s2))); final String s3 = "universita universita milano milano"; System.out.println("s3 = " + s3); - System.out.println(cf.apply(conf, Lists.newArrayList(title(s3)))); + System.out.println(cf.apply(conf, Lists.newArrayList(s3))); final String s4 = "Politechniki Warszawskiej (Warsaw University of Technology)"; System.out.println("s4 = " + s4); - System.out.println(cf.apply(conf, Lists.newArrayList(title(s4)))); + System.out.println(cf.apply(conf, Lists.newArrayList(s4))); final String s5 = "İstanbul Ticarət Universiteti"; System.out.println("s5 = " + s5); - System.out.println(cf.apply(conf, Lists.newArrayList(title(s5)))); + System.out.println(cf.apply(conf, Lists.newArrayList(s5))); final String s6 = "National and Kapodistrian University of Athens"; System.out.println("s6 = " + s6); - System.out.println(cf.apply(conf, Lists.newArrayList(title(s6)))); + System.out.println(cf.apply(conf, Lists.newArrayList(s6))); final String s7 = "Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών"; System.out.println("s7 = " + s7); - System.out.println(cf.apply(conf, Lists.newArrayList(title(s7)))); + System.out.println(cf.apply(conf, Lists.newArrayList(s7))); } @@ -222,11 +220,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final ClusteringFunction cf = new PersonClustering(params); final String s = "Abd-Alla, Abo-el-nour N."; System.out.println("s = " + s); - System.out.println(cf.apply(conf, Lists.newArrayList(title(s)))); + System.out.println(cf.apply(conf, Lists.newArrayList(s))); final String s1 = "Manghi, Paolo"; System.out.println("s1 = " + s1); - System.out.println(cf.apply(conf, Lists.newArrayList(title(s1)))); + System.out.println(cf.apply(conf, Lists.newArrayList(s1))); } @@ -236,11 +234,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final ClusteringFunction cf = new PersonHash(params); final String s = "Manghi, Paolo"; System.out.println("s = " + s); - System.out.println(cf.apply(conf, Lists.newArrayList(title(s)))); + System.out.println(cf.apply(conf, Lists.newArrayList(s))); final String s1 = "Manghi, P."; System.out.println("s = " + s1); - System.out.println(cf.apply(conf, Lists.newArrayList(title(s1)))); + System.out.println(cf.apply(conf, Lists.newArrayList(s1))); } @@ -250,7 +248,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final ClusteringFunction cf = new LastNameFirstInitial(params); final String s = "LI Yonghong"; System.out.println("s = " + s); - System.out.println(cf.apply(conf, Lists.newArrayList(title(s)))); + System.out.println(cf.apply(conf, Lists.newArrayList(s))); } } diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java index 5c846c058..b37e16cf5 100644 --- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java +++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java @@ -3,19 +3,16 @@ package eu.dnetlib.pace.comparators; import static org.junit.jupiter.api.Assertions.assertEquals; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; +import java.util.*; -import org.junit.jupiter.api.*; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInstance; import eu.dnetlib.pace.AbstractPaceTest; import eu.dnetlib.pace.clustering.NGramUtils; import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.config.Type; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldValueImpl; import eu.dnetlib.pace.tree.*; @TestInstance(TestInstance.Lifecycle.PER_CLASS) @@ -99,8 +96,8 @@ public class ComparatorTest extends AbstractPaceTest { @Test public void listContainsMatchTest() { - Field a = createFieldList(Arrays.asList("Article", "Publication", "ORP"), "instanceType"); - Field b = createFieldList(Arrays.asList("Publication", "Article", "ORP"), "instanceType"); + List a = createFieldList(Arrays.asList("Article", "Publication", "ORP"), "instanceType"); + List b = createFieldList(Arrays.asList("Publication", "Article", "ORP"), "instanceType"); params.put("string", "Article"); params.put("bool", "XOR"); @@ -214,31 +211,32 @@ public class ComparatorTest extends AbstractPaceTest { final InstanceTypeMatch instanceTypeMatch = new InstanceTypeMatch(params); - Field a = createFieldList(Arrays.asList("Article", "Article", "Article"), "instanceType"); - Field b = createFieldList(Arrays.asList("Article", "Article", "Article"), "instanceType"); + List a = createFieldList(Arrays.asList("Article", "Article", "Article"), "instanceType"); + List b = createFieldList(Arrays.asList("Article", "Article", "Article"), "instanceType"); double result = instanceTypeMatch.compare(a, b, conf); assertEquals(1.0, result); - Field c = createFieldList( + List c = createFieldList( Arrays.asList("Conference object", "Conference object", "Conference object"), "instanceType"); result = instanceTypeMatch.compare(c, b, conf); assertEquals(1.0, result); - Field d = createFieldList(Arrays.asList("Master thesis", "Master thesis", "Master thesis"), "instanceType"); - Field e = createFieldList( + List d = createFieldList( + Arrays.asList("Master thesis", "Master thesis", "Master thesis"), "instanceType"); + List e = createFieldList( Arrays.asList("Bachelor thesis", "Bachelor thesis", "Bachelor thesis"), "instanceType"); result = instanceTypeMatch.compare(d, e, conf); assertEquals(1.0, result); - Field g = createFieldList(Arrays.asList("Software Paper", "Software Paper"), "instanceType"); + List g = createFieldList(Arrays.asList("Software Paper", "Software Paper"), "instanceType"); result = instanceTypeMatch.compare(e, g, conf); assertEquals(0.0, result); - Field h = createFieldList(Arrays.asList("Other literature type", "Article"), "instanceType"); + List h = createFieldList(Arrays.asList("Other literature type", "Article"), "instanceType"); result = instanceTypeMatch.compare(a, h, conf); assertEquals(1.0, result); @@ -249,15 +247,15 @@ public class ComparatorTest extends AbstractPaceTest { AuthorsMatch authorsMatch = new AuthorsMatch(params); - Field a = createFieldList( + List a = createFieldList( Arrays.asList("La Bruzzo, Sandro", "Atzori, Claudio", "De Bonis, Michele"), "authors"); - Field b = createFieldList(Arrays.asList("Atzori, C.", "La Bruzzo, S.", "De Bonis, M."), "authors"); + List b = createFieldList(Arrays.asList("Atzori, C.", "La Bruzzo, S.", "De Bonis, M."), "authors"); double result = authorsMatch.compare(a, b, conf); assertEquals(1.0, result); - Field c = createFieldList(Arrays.asList("Manghi, Paolo"), "authors"); - Field d = createFieldList(Arrays.asList("Manghi, Pasquale"), "authors"); + List c = createFieldList(Arrays.asList("Manghi, Paolo"), "authors"); + List d = createFieldList(Arrays.asList("Manghi, Pasquale"), "authors"); result = authorsMatch.compare(c, d, conf); assertEquals(0.0, result); @@ -268,12 +266,12 @@ public class ComparatorTest extends AbstractPaceTest { assertEquals(1.0, result); - Field e = createFieldList(Arrays.asList("Manghi, Paolo", "Atzori, Claudio"), "authors"); + List e = createFieldList(Arrays.asList("Manghi, Paolo", "Atzori, Claudio"), "authors"); result = authorsMatch.compare(a, e, conf); assertEquals(0.25, result); - Field f = createFieldList(new ArrayList<>(), "authors"); + List f = createFieldList(new ArrayList<>(), "authors"); result = authorsMatch.compare(f, f, conf); System.out.println("result = " + result); @@ -284,12 +282,12 @@ public class ComparatorTest extends AbstractPaceTest { JsonListMatch jsonListMatch = new JsonListMatch(params); - Field a = createFieldList( + List a = createFieldList( Arrays .asList( "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}"), "authors"); - Field b = createFieldList( + List b = createFieldList( Arrays .asList( "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmc\",\"classname\":\"PubMed Central ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"PMC5399005\"}", @@ -313,8 +311,8 @@ public class ComparatorTest extends AbstractPaceTest { public void domainExactMatch() { DomainExactMatch domainExactMatch = new DomainExactMatch(params); - Field a = url("http://www.flowrepository.org"); - Field b = url("http://flowrepository.org/"); + String a = url("http://www.flowrepository.org"); + String b = url("http://flowrepository.org/"); double compare = domainExactMatch.compare(a, b, conf); System.out.println("compare = " + compare); @@ -326,12 +324,12 @@ public class ComparatorTest extends AbstractPaceTest { CosineSimilarity cosineSimilarity = new CosineSimilarity(params); - Field a = new FieldValueImpl(Type.DoubleArray, "array", new double[] { + double[] a = new double[] { 1, 2, 3 - }); - Field b = new FieldValueImpl(Type.DoubleArray, "array", new double[] { + }; + double[] b = new double[] { 1, 2, 3 - }); + }; double compare = cosineSimilarity.compare(a, b, conf); diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java index 02b59354a..b46085bb4 100644 --- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java +++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java @@ -3,26 +3,14 @@ package eu.dnetlib.pace.config; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.assertTrue; -import java.util.*; -import java.util.stream.Collectors; +import java.util.HashMap; +import java.util.Map; -import org.junit.jupiter.api.*; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import eu.dnetlib.pace.AbstractPaceTest; -import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner; -import eu.dnetlib.pace.clustering.ClusteringClass; -import eu.dnetlib.pace.clustering.ClusteringCombiner; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldList; -import eu.dnetlib.pace.model.FieldValue; -import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.tree.JsonListMatch; -import eu.dnetlib.pace.tree.support.AggType; -import eu.dnetlib.pace.tree.support.FieldConf; -import eu.dnetlib.pace.tree.support.TreeNodeDef; -import eu.dnetlib.pace.tree.support.TreeNodeStats; import eu.dnetlib.pace.util.MapDocumentUtil; public class ConfigTest extends AbstractPaceTest { @@ -82,41 +70,6 @@ public class ConfigTest extends AbstractPaceTest { assertEquals(0, load.getPace().translationMap().keySet().size()); } - @Test - public void asMapDocumentTest1() { - - DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json")); - - final String json = readFromClasspath("publication.json"); - - final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json); - -// System.out.println("mapDocument = " + mapDocument.getFieldMap()); - -// JsonListMatch jsonListMatch = new JsonListMatch(params); -// -// jsonListMatch.compare(mapDocument.getFieldMap().get("pid"), mapDocument.getFieldMap().get("pid"), null); - - System.out.println("mapDocument = " + mapDocument.getFieldMap().get("title").stringValue()); - - } - - @Test - public void authorAsMapDocument() { - - DedupConfig dedupConf = DedupConfig.load(readFromClasspath("author.fdup.conf.json")); - - final String json = readFromClasspath("author.json"); - - final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json); - - System.out - .println( - "mapDocument = " - + Arrays.toString(((FieldValue) mapDocument.getFieldMap().get("topics")).doubleArrayValue())); - - } - @Test public void testJPath() { final String json = readFromClasspath("organization.json"); @@ -126,53 +79,4 @@ public class ConfigTest extends AbstractPaceTest { System.out.println("result = " + MapDocumentUtil.getJPathString(jpath, json)); } - @Test - public void clusteringCombinerTest() { - - DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json")); - - final String json = readFromClasspath("publication.json"); - - final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json); - - String[] combine = ClusteringCombiner.combine(mapDocument, dedupConf).toArray(new String[3]); - - assertEquals("test", combine[0].split(":")[1]); - assertEquals("title", combine[1].split(":")[1]); - assertEquals("doi", combine[2].split(":")[1]); - } - - @Test - public void filterAndCombineTest() { - - DedupConfig dedupConf = DedupConfig.load(readFromClasspath("pub.prod.conf.json")); - - final String json = readFromClasspath("publication.example.json"); - - final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json); - - Collection strings = BlacklistAwareClusteringCombiner.filterAndCombine(mapDocument, dedupConf); - - for (String s : strings) { - System.out.println("s = " + s); - } - - } - - @Test - public void crossCompareTest() { - - DedupConfig dedupConf = DedupConfig.load(readFromClasspath("organization.cross.compare.conf.json")); - - TreeNodeDef treeNode = dedupConf.decisionTree().get("start"); - - final String json = readFromClasspath("organization.json"); - - final MapDocument doc = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json); - - TreeNodeStats nodeStats = treeNode.evaluate(doc, doc, dedupConf); - - assertTrue(nodeStats.getFinalScore(AggType.MAX) > 0.7); - - } } diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java index 41b24d8be..6056c342d 100644 --- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java +++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java @@ -6,9 +6,11 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import java.util.HashMap; import java.util.Map; -import org.junit.jupiter.api.*; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import eu.dnetlib.pace.model.Person; +import jdk.nashorn.internal.ir.annotations.Ignore; public class UtilTest { @@ -20,6 +22,7 @@ public class UtilTest { } @Test + @Ignore public void paceResolverTest() { PaceResolver paceResolver = new PaceResolver(); paceResolver.getComparator("keywordMatch", params); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java index 139f7e74a..e507f8c56 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java @@ -7,8 +7,8 @@ import java.util.ArrayList; import java.util.Iterator; import java.util.List; -import org.apache.commons.lang.StringUtils; -import org.apache.commons.lang.reflect.FieldUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.reflect.FieldUtils; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.ss.usermodel.Cell; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadProjects.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadProjects.java index 904837e3d..f652b3dba 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadProjects.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadProjects.java @@ -1,12 +1,13 @@ package eu.dnetlib.dhp.actionmanager.project.utils; -import java.io.*; +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.Serializable; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; -import java.util.zip.ZipEntry; -import java.util.zip.ZipInputStream; import org.apache.commons.io.IOUtils; import org.apache.hadoop.conf.Configuration; @@ -66,7 +67,7 @@ public class ReadProjects implements Serializable { FSDataInputStream inputStream = fs.open(hdfsreadpath); - ArrayList projects = OBJECT_MAPPER + List projects = OBJECT_MAPPER .readValue( IOUtils.toString(inputStream, "UTF-8"), new TypeReference>() { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadTopics.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadTopics.java index e0e34be31..3b0603c6b 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadTopics.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadTopics.java @@ -6,7 +6,6 @@ import java.io.IOException; import java.io.OutputStreamWriter; import java.io.Serializable; import java.nio.charset.StandardCharsets; -import java.util.ArrayList; import java.util.List; import org.apache.commons.io.IOUtils; @@ -23,7 +22,6 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.actionmanager.project.PrepareProjects; import eu.dnetlib.dhp.actionmanager.project.utils.model.JsonTopic; -import eu.dnetlib.dhp.actionmanager.project.utils.model.Project; import eu.dnetlib.dhp.application.ArgumentApplicationParser; /** @@ -68,7 +66,7 @@ public class ReadTopics implements Serializable { FSDataInputStream inputStream = fs.open(hdfsreadpath); - ArrayList topics = OBJECT_MAPPER + List topics = OBJECT_MAPPER .readValue( IOUtils.toString(inputStream, "UTF-8"), new TypeReference>() { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java index 28b2572fb..8f4a9e393 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java @@ -9,7 +9,7 @@ import java.util.Iterator; import java.util.Queue; import java.util.concurrent.PriorityBlockingQueue; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.DocumentHelper; diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/CheckDuplictedIdsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/CheckDuplictedIdsJob.java index f0aa6491f..bb3a17ac4 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/CheckDuplictedIdsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/CheckDuplictedIdsJob.java @@ -2,7 +2,7 @@ package eu.dnetlib.dhp.broker.oa; import org.apache.commons.io.IOUtils; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.MapFunction; diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java index a49801f32..a6fa2b1a1 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java @@ -3,6 +3,7 @@ package eu.dnetlib.dhp.broker.oa.util; import java.io.IOException; +import org.apache.spark.sql.Row; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -10,9 +11,8 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.model.SparkDeduper; import eu.dnetlib.pace.tree.support.TreeProcessor; -import eu.dnetlib.pace.util.MapDocumentUtil; public class TrustUtils { @@ -20,13 +20,18 @@ public class TrustUtils { private static DedupConfig dedupConfig; + private static SparkDeduper deduper; + + private static final ObjectMapper mapper; + static { - final ObjectMapper mapper = new ObjectMapper(); + mapper = new ObjectMapper(); try { dedupConfig = mapper .readValue( DedupConfig.class.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/dedupConfig/dedupConfig.json"), DedupConfig.class); + deduper = new SparkDeduper(dedupConfig); } catch (final IOException e) { log.error("Error loading dedupConfig, e"); } @@ -42,11 +47,8 @@ public class TrustUtils { } try { - final ObjectMapper objectMapper = new ObjectMapper(); - final MapDocument doc1 = MapDocumentUtil - .asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r1)); - final MapDocument doc2 = MapDocumentUtil - .asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r2)); + final Row doc1 = deduper.model().rowFromJson(mapper.writeValueAsString(r1)); + final Row doc2 = deduper.model().rowFromJson(mapper.writeValueAsString(r2)); final double score = new TreeProcessor(dedupConfig).computeScore(doc1, doc2); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/stats/StatsAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/stats/StatsAggregator.java index 240e2d211..0f8bad2ce 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/stats/StatsAggregator.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/stats/StatsAggregator.java @@ -1,7 +1,7 @@ package eu.dnetlib.dhp.broker.oa.util.aggregators.stats; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.spark.sql.Encoder; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.expressions.Aggregator; diff --git a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/samples/SimpleVariableJobTest.java b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/samples/SimpleVariableJobTest.java index a6d1c89d3..73d1731cf 100644 --- a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/samples/SimpleVariableJobTest.java +++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/samples/SimpleVariableJobTest.java @@ -12,6 +12,7 @@ import java.util.stream.Collectors; import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; @@ -82,8 +83,8 @@ public class SimpleVariableJobTest { final long n = spark .createDataset(inputList, Encoders.STRING()) - .filter(s -> filter(map.get(s))) - .map((MapFunction) s -> s.toLowerCase(), Encoders.STRING()) + .filter((FilterFunction) s -> filter(map.get(s))) + .map((MapFunction) String::toLowerCase, Encoders.STRING()) .count(); System.out.println(n); @@ -96,8 +97,8 @@ public class SimpleVariableJobTest { final long n = spark .createDataset(inputList, Encoders.STRING()) - .filter(s -> filter(staticMap.get(s))) - .map((MapFunction) s -> s.toLowerCase(), Encoders.STRING()) + .filter((FilterFunction) s -> filter(staticMap.get(s))) + .map((MapFunction) String::toLowerCase, Encoders.STRING()) .count(); System.out.println(n); diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java index 136413376..68af3d699 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java @@ -88,9 +88,7 @@ abstract class AbstractSparkAction implements Serializable { "for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", configProfileId)); - DedupConfig dedupConfig = new ObjectMapper().readValue(conf, DedupConfig.class); - dedupConfig.getPace().initModel(); - dedupConfig.getPace().initTranslationMap(); + DedupConfig dedupConfig = DedupConfig.load(conf); dedupConfig.getWf().setConfigurationId(actionSetId); return dedupConfig; diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DatePicker.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DatePicker.java index 9d767c4d2..8ed3c68b2 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DatePicker.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DatePicker.java @@ -5,14 +5,14 @@ import static java.util.Collections.reverseOrder; import static java.util.Map.Entry.comparingByValue; import static java.util.stream.Collectors.toMap; -import static org.apache.commons.lang.StringUtils.endsWith; -import static org.apache.commons.lang.StringUtils.substringBefore; +import static org.apache.commons.lang3.StringUtils.endsWith; +import static org.apache.commons.lang3.StringUtils.substringBefore; import java.time.Year; import java.util.*; import java.util.stream.Collectors; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import eu.dnetlib.dhp.schema.oaf.Field; diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java index aeb485768..a7a27f5c9 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java @@ -2,27 +2,22 @@ package eu.dnetlib.dhp.oa.dedup; import java.io.StringReader; -import java.util.*; +import java.util.ArrayList; +import java.util.List; -import org.apache.spark.SparkContext; -import org.apache.spark.util.LongAccumulator; import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.Element; import org.dom4j.io.SAXReader; import org.xml.sax.SAXException; -import com.google.common.collect.Sets; - import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner; import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.model.MapDocument; public class DedupUtility { @@ -32,43 +27,6 @@ public class DedupUtility { private DedupUtility() { } - public static Map constructAccumulator( - final DedupConfig dedupConf, final SparkContext context) { - - Map accumulators = new HashMap<>(); - - String acc1 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1"); - accumulators.put(acc1, context.longAccumulator(acc1)); - String acc2 = String - .format( - "%s::%s", - dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()); - accumulators.put(acc2, context.longAccumulator(acc2)); - String acc3 = String - .format( - "%s::%s", - dedupConf.getWf().getEntityType(), - String - .format( - "Skipped records for count(%s) >= %s", - dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize())); - accumulators.put(acc3, context.longAccumulator(acc3)); - String acc4 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list"); - accumulators.put(acc4, context.longAccumulator(acc4)); - String acc5 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"); - accumulators.put(acc5, context.longAccumulator(acc5)); - String acc6 = String - .format( - "%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()); - accumulators.put(acc6, context.longAccumulator(acc6)); - - return accumulators; - } - - static Set getGroupingKeys(DedupConfig conf, MapDocument doc) { - return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf)); - } - public static String createDedupRecordPath( final String basePath, final String actionSetId, final String entityType) { return String.format("%s/%s/%s_deduprecord", basePath, actionSetId, entityType); diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java deleted file mode 100644 index 68201677e..000000000 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java +++ /dev/null @@ -1,58 +0,0 @@ - -package eu.dnetlib.dhp.oa.dedup; - -import java.util.Map; -import java.util.stream.Collectors; - -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.util.LongAccumulator; - -import eu.dnetlib.dhp.oa.dedup.model.Block; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.util.BlockProcessor; -import scala.Serializable; -import scala.Tuple2; - -public class Deduper implements Serializable { - - public static JavaPairRDD computeRelations( - JavaSparkContext context, JavaPairRDD blocks, DedupConfig config) { - Map accumulators = DedupUtility.constructAccumulator(config, context.sc()); - - return blocks - .flatMapToPair( - it -> { - final SparkReporter reporter = new SparkReporter(accumulators); - new BlockProcessor(config) - .processSortedBlock(it._1(), it._2().getDocuments(), reporter); - return reporter.getRelations().iterator(); - }) - .mapToPair(it -> new Tuple2<>(it._1() + it._2(), it)) - .reduceByKey((a, b) -> a) - .mapToPair(Tuple2::_2); - } - - public static JavaPairRDD createSortedBlocks( - JavaPairRDD mapDocs, DedupConfig config) { - final String of = config.getWf().getOrderField(); - final int maxQueueSize = config.getWf().getQueueMaxSize(); - - return mapDocs - // the reduce is just to be sure that we haven't document with same id - .reduceByKey((a, b) -> a) - .map(Tuple2::_2) - // Clustering: from to List - .flatMap( - a -> DedupUtility - .getGroupingKeys(config, a) - .stream() - .map(it -> Block.from(it, a)) - .collect(Collectors.toList()) - .iterator()) - .mapToPair(block -> new Tuple2<>(block.getKey(), block)) - .reduceByKey((b1, b2) -> Block.from(b1, b2, of, maxQueueSize)) - .filter(b -> b._2().getDocuments().size() > 1); - } -} diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java index c9c9dd8fe..3e5215d42 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java @@ -2,32 +2,26 @@ package eu.dnetlib.dhp.oa.dedup; import java.io.IOException; +import java.util.Collection; import java.util.Optional; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.*; import org.dom4j.DocumentException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.SAXException; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.oa.dedup.model.Block; import eu.dnetlib.dhp.oa.dedup.model.BlockStats; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.util.MapDocumentUtil; -import scala.Tuple2; +import eu.dnetlib.pace.model.SparkDeduper; public class SparkBlockStats extends AbstractSparkAction { @@ -91,36 +85,35 @@ public class SparkBlockStats extends AbstractSparkAction { JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - JavaPairRDD mapDocuments = sc + SparkDeduper deduper = new SparkDeduper(dedupConf); + + Dataset simRels = spark + .read() .textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) - .repartition(numPartitions) - .mapToPair( - (PairFunction) s -> { - MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); - return new Tuple2<>(d.getIdentifier(), d); - }); + .transform(deduper.model().parseJsonDataset()) + .transform(deduper.filterAndCleanup()) + .transform(deduper.generateClustersWithCollect()) + .filter(functions.size(new Column("block")).geq(1)); - // create blocks for deduplication - JavaRDD blockStats = Deduper - .createSortedBlocks(mapDocuments, dedupConf) - .repartition(numPartitions) - .map(b -> asBlockStats(dedupConf, b)); + simRels.map((MapFunction) row -> { + Collection mapDocuments = row.getList(row.fieldIndex("block")); - // save the blockstats in the workingdir - spark - .createDataset(blockStats.rdd(), Encoders.bean(BlockStats.class)) + /* + * List mapDocuments = documents .stream() .sorted( new + * RowDataOrderingComparator(deduper.model().orderingFieldPosition(), + * deduper.model().identityFieldPosition())) .limit(dedupConf.getWf().getQueueMaxSize()) + * .collect(Collectors.toList()); + */ + + return new BlockStats( + row.getString(row.fieldIndex("key")), + (long) mapDocuments.size(), + computeComparisons( + (long) mapDocuments.size(), (long) dedupConf.getWf().getSlidingWindowSize())); + }, Encoders.bean(BlockStats.class)) .write() .mode(SaveMode.Overwrite) .save(outputPath); } } - - private BlockStats asBlockStats(DedupConfig dedupConf, Tuple2 b) { - return new BlockStats( - b._1(), - (long) b._2().getDocuments().size(), - computeComparisons( - (long) b._2().getDocuments().size(), (long) dedupConf.getWf().getSlidingWindowSize())); - } - } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java index ca6398e8a..5b3cc3111 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java @@ -7,13 +7,9 @@ import java.util.Optional; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.*; import org.dom4j.DocumentException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -22,15 +18,12 @@ import org.xml.sax.SAXException; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.dedup.log.DedupLogModel; import eu.dnetlib.dhp.application.dedup.log.DedupLogWriter; -import eu.dnetlib.dhp.oa.dedup.model.Block; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.util.MapDocumentUtil; -import scala.Tuple2; +import eu.dnetlib.pace.model.SparkDeduper; public class SparkCreateSimRels extends AbstractSparkAction { @@ -38,6 +31,7 @@ public class SparkCreateSimRels extends AbstractSparkAction { public SparkCreateSimRels(ArgumentApplicationParser parser, SparkSession spark) { super(parser, spark); + spark.sparkContext().setLogLevel("WARN"); } public static void main(String[] args) throws Exception { @@ -79,7 +73,6 @@ public class SparkCreateSimRels extends AbstractSparkAction { // for each dedup configuration for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) { - final long start = System.currentTimeMillis(); final String entity = dedupConf.getWf().getEntityType(); @@ -91,27 +84,17 @@ public class SparkCreateSimRels extends AbstractSparkAction { JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - JavaPairRDD mapDocuments = sc + SparkDeduper deduper = new SparkDeduper(dedupConf); + + Dataset simRels = spark + .read() .textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) - .repartition(numPartitions) - .mapToPair( - (PairFunction) s -> { - MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); - return new Tuple2<>(d.getIdentifier(), d); - }); - - // create blocks for deduplication - JavaPairRDD blocks = Deduper - .createSortedBlocks(mapDocuments, dedupConf) - .repartition(numPartitions); - - Dataset simRels = spark - .createDataset( - Deduper - .computeRelations(sc, blocks, dedupConf) - .map(t -> DedupUtility.createSimRel(t._1(), t._2(), entity)) - .repartition(numPartitions) - .rdd(), + .transform(deduper.model().parseJsonDataset()) + .transform(deduper.dedup()) + .distinct() + .map( + (MapFunction) t -> DedupUtility + .createSimRel(t.getStruct(0).getString(0), t.getStruct(0).getString(1), entity), Encoders.bean(Relation.class)); saveParquet(simRels, outputPath, SaveMode.Overwrite); diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkReporter.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkReporter.java deleted file mode 100644 index 005e65ddf..000000000 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkReporter.java +++ /dev/null @@ -1,50 +0,0 @@ - -package eu.dnetlib.dhp.oa.dedup; - -import java.util.ArrayList; -import java.util.List; -import java.util.Map; - -import org.apache.spark.util.LongAccumulator; - -import eu.dnetlib.pace.util.Reporter; -import scala.Serializable; -import scala.Tuple2; - -public class SparkReporter implements Serializable, Reporter { - - private final List> relations = new ArrayList<>(); - - private final Map accumulators; - - public SparkReporter(Map accumulators) { - this.accumulators = accumulators; - } - - public void incrementCounter( - String counterGroup, - String counterName, - long delta, - Map accumulators) { - - final String accumulatorName = String.format("%s::%s", counterGroup, counterName); - if (accumulators.containsKey(accumulatorName)) { - accumulators.get(accumulatorName).add(delta); - } - } - - @Override - public void incrementCounter(String counterGroup, String counterName, long delta) { - - incrementCounter(counterGroup, counterName, delta, accumulators); - } - - @Override - public void emit(String type, String from, String to) { - relations.add(new Tuple2<>(from, to)); - } - - public List> getRelations() { - return relations; - } -} diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkWhitelistSimRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkWhitelistSimRels.java index 72c10e2a6..94a09ed05 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkWhitelistSimRels.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkWhitelistSimRels.java @@ -8,26 +8,18 @@ import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.*; import org.dom4j.DocumentException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.SAXException; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.util.MapDocumentUtil; -import scala.Tuple2; public class SparkWhitelistSimRels extends AbstractSparkAction { @@ -78,15 +70,14 @@ public class SparkWhitelistSimRels extends AbstractSparkAction { JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); // file format: source####target - Dataset> whiteListRels = spark - .createDataset( - sc - .textFile(whiteListPath) - // check if the line is in the correct format: id1####id2 - .filter(s -> s.contains(WHITELIST_SEPARATOR) && s.split(WHITELIST_SEPARATOR).length == 2) - .map(s -> new Tuple2<>(s.split(WHITELIST_SEPARATOR)[0], s.split(WHITELIST_SEPARATOR)[1])) - .rdd(), - Encoders.tuple(Encoders.STRING(), Encoders.STRING())); + Dataset whiteListRels = spark + .read() + .textFile(whiteListPath) + .withColumn("pairs", functions.split(new Column("value"), WHITELIST_SEPARATOR)) + .filter(functions.size(new Column("pairs")).equalTo(2)) + .select( + functions.element_at(new Column("pairs"), 1).as("from"), + functions.element_at(new Column("pairs"), 2).as("to")); // for each dedup configuration for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) { @@ -97,35 +88,26 @@ public class SparkWhitelistSimRels extends AbstractSparkAction { final String outputPath = DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity); - Dataset> entities = spark - .createDataset( - sc - .textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) - .repartition(numPartitions) - .mapToPair( - (PairFunction) s -> { - MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); - return new Tuple2<>(d.getIdentifier(), "present"); - }) - .rdd(), - Encoders.tuple(Encoders.STRING(), Encoders.STRING())); + // DFMapDocumentUtils.registerUDFs(spark, dedupConf); - Dataset> whiteListRels1 = whiteListRels - .joinWith(entities, whiteListRels.col("_1").equalTo(entities.col("_1")), "inner") - .map( - (MapFunction, Tuple2>, Tuple2>) Tuple2::_1, - Encoders.tuple(Encoders.STRING(), Encoders.STRING())); + Dataset entities = spark + .read() + .textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) + .repartition(numPartitions) + .withColumn("id", functions.get_json_object(new Column("value"), dedupConf.getWf().getIdPath())); - Dataset> whiteListRels2 = whiteListRels1 - .joinWith(entities, whiteListRels1.col("_2").equalTo(entities.col("_1")), "inner") - .map( - (MapFunction, Tuple2>, Tuple2>) Tuple2::_1, - Encoders.tuple(Encoders.STRING(), Encoders.STRING())); + Dataset whiteListRels1 = whiteListRels + .join(entities, entities.col("id").equalTo(whiteListRels.col("from")), "inner") + .select("from", "to"); + + Dataset whiteListRels2 = whiteListRels1 + .join(entities, whiteListRels1.col("to").equalTo(entities.col("id")), "inner") + .select("from", "to"); Dataset whiteListSimRels = whiteListRels2 .map( - (MapFunction, Relation>) r -> DedupUtility - .createSimRel(r._1(), r._2(), entity), + (MapFunction) r -> DedupUtility + .createSimRel(r.getString(0), r.getString(1), entity), Encoders.bean(Relation.class)); saveParquet(whiteListSimRels, outputPath, SaveMode.Append); diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java index 3e564052e..4a39a175d 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java @@ -6,7 +6,7 @@ import java.io.Serializable; import java.util.Set; import java.util.stream.Collectors; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.codehaus.jackson.annotate.JsonIgnore; import com.fasterxml.jackson.databind.ObjectMapper; diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/Block.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/Block.java deleted file mode 100644 index 4f0d95c8f..000000000 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/Block.java +++ /dev/null @@ -1,80 +0,0 @@ - -package eu.dnetlib.dhp.oa.dedup.model; - -import java.io.Serializable; -import java.util.ArrayList; -import java.util.Comparator; -import java.util.Iterator; -import java.util.List; -import java.util.stream.Collectors; -import java.util.stream.Stream; -import java.util.stream.StreamSupport; - -import com.google.common.collect.Lists; - -import eu.dnetlib.pace.model.MapDocument; - -public class Block implements Serializable { - - private String key; - - private List documents; - - public Block() { - super(); - } - - public static Block from(String key, MapDocument doc) { - Block block = new Block(); - block.setKey(key); - block.setDocuments(Lists.newArrayList(doc)); - return block; - } - - public static Block from(String key, Iterator blocks, String orderField, int maxSize) { - Block block = new Block(); - block.setKey(key); - - Iterable it = () -> blocks; - - block - .setDocuments( - StreamSupport - .stream(it.spliterator(), false) - .flatMap(b -> b.getDocuments().stream()) - .sorted(Comparator.comparing(a -> a.getFieldMap().get(orderField).stringValue())) - .limit(maxSize) - .collect(Collectors.toCollection(ArrayList::new))); - return block; - } - - public static Block from(Block b1, Block b2, String orderField, int maxSize) { - Block block = new Block(); - block.setKey(b1.getKey()); - block - .setDocuments( - Stream - .concat(b1.getDocuments().stream(), b2.getDocuments().stream()) - .sorted(Comparator.comparing(a -> a.getFieldMap().get(orderField).stringValue())) - .limit(maxSize) - .collect(Collectors.toCollection(ArrayList::new))); - - return block; - } - - public String getKey() { - return key; - } - - public void setKey(String key) { - this.key = key; - } - - public List getDocuments() { - return documents; - } - - public void setDocuments(List documents) { - this.documents = documents; - } -} diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java index c9cfb8cb2..42ca1613f 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java @@ -9,7 +9,8 @@ import java.io.IOException; import java.io.Serializable; import java.lang.reflect.InvocationTargetException; import java.nio.file.Paths; -import java.util.*; +import java.util.ArrayList; +import java.util.List; import java.util.stream.Collectors; import org.codehaus.jackson.map.ObjectMapper; @@ -17,7 +18,10 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import eu.dnetlib.dhp.oa.merge.AuthorMerger; -import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Software; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.pace.util.MapDocumentUtil; import scala.Tuple2; diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java index 3de14f577..6a2c6dcc5 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java @@ -178,16 +178,18 @@ public class SparkDedupTest implements Serializable { .load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "otherresearchproduct")) .count(); - assertEquals(3076, orgs_simrel); - assertEquals(7046, pubs_simrel); - assertEquals(336, sw_simrel); - assertEquals(442, ds_simrel); - assertEquals(6784, orp_simrel); -// System.out.println("orgs_simrel = " + orgs_simrel); -// System.out.println("pubs_simrel = " + pubs_simrel); -// System.out.println("sw_simrel = " + sw_simrel); -// System.out.println("ds_simrel = " + ds_simrel); -// System.out.println("orp_simrel = " + orp_simrel); + System.out.println("orgs_simrel = " + orgs_simrel); + System.out.println("pubs_simrel = " + pubs_simrel); + System.out.println("sw_simrel = " + sw_simrel); + System.out.println("ds_simrel = " + ds_simrel); + System.out.println("orp_simrel = " + orp_simrel); + + assertEquals(1538, orgs_simrel); + assertEquals(3523, pubs_simrel); + assertEquals(168, sw_simrel); + assertEquals(221, ds_simrel); + assertEquals(3392, orp_simrel); + } @Test @@ -231,10 +233,10 @@ public class SparkDedupTest implements Serializable { .count(); // entities simrels supposed to be equal to the number of previous step (no rels in whitelist) - assertEquals(3076, orgs_simrel); - assertEquals(7046, pubs_simrel); - assertEquals(442, ds_simrel); - assertEquals(6784, orp_simrel); + assertEquals(1538, orgs_simrel); + assertEquals(3523, pubs_simrel); + assertEquals(221, ds_simrel); + assertEquals(3392, orp_simrel); // System.out.println("orgs_simrel = " + orgs_simrel); // System.out.println("pubs_simrel = " + pubs_simrel); // System.out.println("ds_simrel = " + ds_simrel); @@ -264,7 +266,7 @@ public class SparkDedupTest implements Serializable { && rel.getTarget().equalsIgnoreCase(whiteList.get(1).split(WHITELIST_SEPARATOR)[1])) .count() > 0); - assertEquals(338, sw_simrel.count()); + assertEquals(170, sw_simrel.count()); // System.out.println("sw_simrel = " + sw_simrel.count()); } diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkOpenorgsDedupTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkOpenorgsDedupTest.java index 88c28ab2f..a0c7772e9 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkOpenorgsDedupTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkOpenorgsDedupTest.java @@ -143,7 +143,7 @@ public class SparkOpenorgsDedupTest implements Serializable { .load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization")) .count(); - assertEquals(290, orgs_simrel); + assertEquals(145, orgs_simrel); } @Test @@ -172,7 +172,7 @@ public class SparkOpenorgsDedupTest implements Serializable { .load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization")) .count(); - assertEquals(326, orgs_simrel); + assertEquals(181, orgs_simrel); } @Test diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkOpenorgsProvisionTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkOpenorgsProvisionTest.java index 2a9f34dee..a0bf6b37e 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkOpenorgsProvisionTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkOpenorgsProvisionTest.java @@ -11,8 +11,6 @@ import java.io.IOException; import java.io.Serializable; import java.net.URISyntaxException; import java.nio.file.Paths; -import java.util.List; -import java.util.stream.Collectors; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; @@ -31,8 +29,6 @@ import org.mockito.Mock; import org.mockito.Mockito; import org.mockito.junit.jupiter.MockitoExtension; -import com.fasterxml.jackson.databind.ObjectMapper; - import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkPublicationRootsTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkPublicationRootsTest.java index 3cff836eb..e3fe882ef 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkPublicationRootsTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkPublicationRootsTest.java @@ -129,7 +129,7 @@ public class SparkPublicationRootsTest implements Serializable { .load(DedupUtility.createSimRelPath(workingPath, testActionSetId, "publication")) .count(); - assertEquals(74, pubs_simrel); + assertEquals(37, pubs_simrel); } @Test diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/jpath/JsonPathTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/jpath/JsonPathTest.java index 7348a3bd2..705c2cc84 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/jpath/JsonPathTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/jpath/JsonPathTest.java @@ -1,305 +1,31 @@ package eu.dnetlib.dhp.oa.dedup.jpath; +import java.io.IOException; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.sql.Row; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import org.junit.platform.commons.util.StringUtils; import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.util.MapDocumentUtil; +import eu.dnetlib.pace.model.SparkModel; class JsonPathTest { - String json = "{\t\"dataInfo\":{\t\t\"invisible\":false,\t\t\"inferred\":false,\t\t\"deletedbyinference\":false,\t\t\"trust\":\"0.810000002384185791\",\t\t\"inferenceprovenance\":\"\",\t\t\"provenanceaction\":{\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t}\t},\t\"lastupdatetimestamp\":1584960968152,\t\"id\":\"20|corda__h2020::9faf23721249f26ac2c16eb857ea1fb9\",\t\"originalId\":[\t\t\"corda__h2020::927957582\"\t],\t\"collectedfrom\":[\t\t{\t\t\t\"key\":\"openaire____::corda_h2020\",\t\t\t\"value\":\"CORDA - COmmon Research DAta Warehouse - Horizon 2020\",\t\t\t\"dataInfo\":null\t\t}\t],\t\"pid\":[\t],\t\"dateofcollection\":\"2016-06-05\",\t\"dateoftransformation\":\"2019-11-19\",\t\"extraInfo\":[\t],\t\"oaiprovenance\":null,\t\"legalshortname\":{\t\t\"value\":\"Comentor AB\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"legalname\":{\t\t\"value\":\"Comentor AB\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"alternativeNames\":[\t],\t\"websiteurl\":{\t\t\"value\":\"http://www.comentor.se\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"logourl\":null,\t\"eclegalbody\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"eclegalperson\":{\t\t\"value\":\"true\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecnonprofit\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecresearchorganization\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"echighereducation\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecinternationalorganizationeurinterests\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecinternationalorganization\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecenterprise\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecsmevalidated\":{\t\t\"value\":\"true\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecnutscode\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"country\":null}"; - DedupConfig conf = DedupConfig - .load( - "{\n" - + " \"wf\" : {\n" - + " \"threshold\" : \"0.99\",\n" - + " \"dedupRun\" : \"001\",\n" - + " \"entityType\" : \"organization\",\n" - + " \"subEntityValue\": \"organization\",\n" - + " \"orderField\" : \"legalname\",\n" - + " \"queueMaxSize\" : \"2000\",\n" - + " \"groupMaxSize\" : \"50\",\n" - + " \"slidingWindowSize\" : \"200\",\n" - + " \"idPath\":\"$.id\",\n" - + " \"rootBuilder\" : [ \"organization\", \"projectOrganization_participation_isParticipant\", \"datasourceOrganization_provision_isProvidedBy\" ],\n" - + " \"includeChildren\" : \"true\",\n" - + " \"maxIterations\": \"20\"\n" - + " },\n" - + " \"pace\" : {\n" - + " \"clustering\" : [\n" - + " { \"name\" : \"sortedngrampairs\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\" : 2, \"ngramLen\" : \"3\"} },\n" - + " { \"name\" : \"suffixprefix\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\" : 1, \"len\" : \"3\" } },\n" - + " { \"name\" : \"urlclustering\", \"fields\" : [ \"websiteurl\" ], \"params\" : { } },\n" - + " { \"name\" : \"keywordsclustering\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\": 2, \"windowSize\": 4} }\n" - + " ],\n" - + " \"decisionTree\" : {\n" - + " \"start\": {\n" - + " \"fields\": [\n" - + " {\n" - + " \"field\": \"gridid\",\n" - + " \"comparator\": \"exactMatch\",\n" - + " \"weight\": 1,\n" - + " \"countIfUndefined\": \"false\",\n" - + " \"params\": {}\n" - + " }\n" - + " ],\n" - + " \"threshold\": 1,\n" - + " \"aggregation\": \"AVG\",\n" - + " \"positive\": \"MATCH\",\n" - + " \"negative\": \"NO_MATCH\",\n" - + " \"undefined\": \"layer2\",\n" - + " \"ignoreUndefined\": \"false\"\n" - + " },\n" - + " \"layer2\": {\n" - + " \"fields\": [\n" - + " {\n" - + " \"field\": \"websiteurl\",\n" - + " \"comparator\": \"domainExactMatch\",\n" - + " \"weight\": 1,\n" - + " \"countIfUndefined\": \"false\",\n" - + " \"params\": {}\n" - + " },\n" - + " {\n" - + " \"field\": \"country\",\n" - + " \"comparator\": \"exactMatch\",\n" - + " \"weight\": 1,\n" - + " \"countIfUndefined\": \"true\",\n" - + " \"params\": {}\n" - + " },\n" - + " {\n" - + " \"field\": \"legalname\",\n" - + " \"comparator\": \"numbersMatch\",\n" - + " \"weight\": 1,\n" - + " \"countIfUndefined\": \"true\",\n" - + " \"params\": {}\n" - + " },\n" - + " {\n" - + " \"field\": \"legalname\",\n" - + " \"comparator\": \"romansMatch\",\n" - + " \"weight\": 1,\n" - + " \"countIfUndefined\": \"true\",\n" - + " \"params\": {}\n" - + " }\n" - + " ],\n" - + " \"threshold\": 1,\n" - + " \"aggregation\": \"AND\",\n" - + " \"positive\": \"layer3\",\n" - + " \"negative\": \"NO_MATCH\",\n" - + " \"undefined\": \"layer3\",\n" - + " \"ignoreUndefined\": \"true\"\n" - + " },\n" - + " \"layer3\": {\n" - + " \"fields\": [\n" - + " {\n" - + " \"field\": \"legalname\",\n" - + " \"comparator\": \"cityMatch\",\n" - + " \"weight\": 1.0,\n" - + " \"countIfUndefined\": \"true\",\n" - + " \"params\": {\n" - + " \"windowSize\": \"4\"\n" - + " }\n" - + " }\n" - + " ],\n" - + " \"threshold\": 0.1,\n" - + " \"aggregation\": \"AVG\",\n" - + " \"positive\": \"layer4\",\n" - + " \"negative\": \"NO_MATCH\",\n" - + " \"undefined\": \"NO_MATCH\",\n" - + " \"ignoreUndefined\": \"true\"\n" - + " },\n" - + " \"layer4\": {\n" - + " \"fields\": [\n" - + " {\n" - + " \"field\": \"legalname\",\n" - + " \"comparator\": \"keywordMatch\",\n" - + " \"weight\": 1.0,\n" - + " \"countIfUndefined\": \"true\",\n" - + " \"params\": {\n" - + " \"windowSize\": \"4\"\n" - + " }\n" - + " }\n" - + " ],\n" - + " \"threshold\": 0.7,\n" - + " \"aggregation\": \"AVG\",\n" - + " \"positive\": \"layer5\",\n" - + " \"negative\": \"NO_MATCH\",\n" - + " \"undefined\": \"layer5\",\n" - + " \"ignoreUndefined\": \"true\"\n" - + " },\n" - + " \"layer5\": {\n" - + " \"fields\": [\n" - + " {\n" - + " \"field\": \"legalname\",\n" - + " \"comparator\": \"jaroWinklerNormalizedName\",\n" - + " \"weight\": 0.9,\n" - + " \"countIfUndefined\": \"true\",\n" - + " \"params\": {\n" - + " \"windowSize\": \"4\"\n" - + " }\n" - + " },\n" - + " {\n" - + " \"field\": \"legalshortname\",\n" - + " \"comparator\": \"jaroWinklerNormalizedName\",\n" - + " \"weight\": 0.1,\n" - + " \"countIfUndefined\": \"false\",\n" - + " \"params\": {\n" - + " \"windowSize\": 4\n" - + " }\n" - + " }\n" - + " ],\n" - + " \"threshold\": 0.9,\n" - + " \"aggregation\": \"W_MEAN\",\n" - + " \"positive\": \"MATCH\",\n" - + " \"negative\": \"NO_MATCH\",\n" - + " \"undefined\": \"NO_MATCH\",\n" - + " \"ignoreUndefined\": \"true\"\n" - + " }\n" - + " },\n" - + " \"model\" : [\n" - + " { \"name\" : \"country\", \"type\" : \"String\", \"path\" : \"$.country.classid\"},\n" - + " { \"name\" : \"legalshortname\", \"type\" : \"String\", \"path\" : \"$.legalshortname.value\"},\n" - + " { \"name\" : \"legalname\", \"type\" : \"String\", \"path\" : \"$.legalname.value\" },\n" - + " { \"name\" : \"websiteurl\", \"type\" : \"URL\", \"path\" : \"$.websiteurl.value\" },\n" - + " { \"name\" : \"gridid\", \"type\" : \"String\", \"path\" : \"$.pid[?(@.qualifier.classid =='grid')].value\"},\n" - + " { \"name\" : \"originalId\", \"type\" : \"String\", \"path\" : \"$.id\" }\n" - + " ],\n" - + " \"blacklists\" : {\n" - + " \"legalname\" : []\n" - + " },\n" - + " \"synonyms\": {\n" - + " \"key::1\": [\"university\",\"università\", \"universitas\", \"università studi\",\"universitario\",\"universitaria\",\"université\", \"universite\", \"universitaire\",\"universitaires\",\"universidad\",\"universitade\",\"Universität\",\"universitaet\",\"Uniwersytet\",\"университет\",\"universiteit\",\"πανεπιστήμιο\",\"universitesi\",\"universiteti\", \"universiti\"],\n" - + " \"key::2\": [\"studies\",\"studi\",\"études\",\"estudios\",\"estudos\",\"Studien\",\"studia\",\"исследования\",\"studies\",\"σπουδές\"],\n" - + " \"key::3\": [\"advanced\",\"superiore\",\"supérieur\",\"supérieure\",\"supérieurs\",\"supérieures\",\"avancado\",\"avancados\",\"fortgeschrittene\",\"fortgeschritten\",\"zaawansowany\",\"передовой\",\"gevorderd\",\"gevorderde\",\"προχωρημένος\",\"προχωρημένη\",\"προχωρημένο\",\"προχωρημένες\",\"προχωρημένα\",\"wyzsza\"],\n" - + " \"key::4\": [\"institute\",\"istituto\",\"institut\",\"instituto\",\"instituto\",\"Institut\",\"instytut\",\"институт\",\"instituut\",\"ινστιτούτο\"],\n" - + " \"key::5\": [\"hospital\",\"ospedale\",\"hôpital\",\"hospital\",\"hospital\",\"Krankenhaus\",\"szpital\",\"больница\",\"ziekenhuis\",\"νοσοκομείο\"],\n" - + " \"key::6\": [\"research\",\"ricerca\",\"recherche\",\"investigacion\",\"pesquisa\",\"Forschung\",\"badania\",\"исследования\",\"onderzoek\",\"έρευνα\",\"erevna\",\"erevnas\"],\n" - + " \"key::7\": [\"college\",\"collegio\",\"colegio\",\"faculdade\",\"Hochschule\",\"Szkoła Wyższa\",\"Высшая школа\",\"κολλέγιο\"],\n" - + " \"key::8\": [\"foundation\",\"fondazione\",\"fondation\",\"fundación\",\"fundação\",\"Stiftung\",\"Fundacja\",\"фонд\",\"stichting\",\"ίδρυμα\",\"idryma\"],\n" - + " \"key::9\": [\"center\",\"centro\",\"centre\",\"centro\",\"centro\",\"zentrum\",\"centrum\",\"центр\",\"centrum\",\"κέντρο\"],\n" - + " \"key::10\": [\"national\",\"nazionale\",\"national\",\"nationale\",\"nationaux\",\"nationales\",\"nacional\",\"nacional\",\"national\",\"krajowy\",\"национальный\",\"nationaal\",\"nationale\",\"εθνικό\"],\n" - + " \"key::11\": [\"association\",\"associazione\",\"association\",\"asociación\",\"associação\",\"Verein\",\"verband\",\"stowarzyszenie\",\"ассоциация\",\"associatie\"],\n" - + " \"key::12\": [\"society\",\"societa\",\"société\",\"sociedad\",\"sociedade\",\"gesellschaft\",\"społeczeństwo\",\"общество\",\"maatschappij\",\"κοινωνία\"],\n" - + " \"key::13\": [\"international\",\"internazionale\",\"international\",\"internacional\",\"internacional\",\"international\",\"międzynarodowy\",\"Международный\",\"internationaal\",\"internationale\",\"διεθνής\",\"διεθνή\",\"διεθνές\"],\n" - + " \"key::14\": [\"community\",\"comunita\",\"communauté\",\"comunidad\",\"comunidade\",\"Gemeinschaft\",\"społeczność\",\"сообщество\",\"gemeenschap\",\"κοινότητα\"],\n" - + " \"key::15\": [\"school\",\"scuola\",\"école\",\"escuela\",\"escola\",\"schule\",\"Szkoła\",\"школа\",\"school\",\"σχολείο\"],\n" - + " \"key::16\": [\"education\",\"educazione\",\"éducation\",\"educacion\",\"Educação\",\"Bildung\",\"Edukacja\",\"образование\",\"opleiding\",\"εκπαίδευση\"],\n" - + " \"key::17\": [\"academy\",\"accademia\",\"académie\",\"academia\",\"academia\",\"Akademie\",\"akademie\",\"академия\",\"academie\",\"ακαδημία\"],\n" - + " \"key::18\": [\"public\",\"pubblico\",\"public\",\"publique\",\"publics\",\"publiques\",\"publico\",\"publico\",\"Öffentlichkeit\",\"publiczny\",\"публичный\",\"publiek\",\"publieke\",\"δημόσιος\",\"δημόσια\",\"δημόσιο\"],\n" - + " \"key::19\": [\"museum\",\"museo\",\"musée\",\"mueso\",\"museu\",\"museum\",\"muzeum\",\"музей\",\"museum\",\"μουσείο\"],\n" - + " \"key::20\": [\"group\",\"gruppo\",\"groupe\",\"grupo\",\"grupo\",\"gruppe\",\"grupa\",\"группа\",\"groep\",\"ομάδα\",\"όμιλος\"],\n" - + " \"key::21\": [\"department\",\"dipartimento\",\"département\",\"departamento\",\"departamento\",\"abteilung\",\"departament\",\"отдел\",\"afdeling\",\"τμήμα\"],\n" - + " \"key::22\": [\"council\",\"consiglio\",\"conseil\",\"Consejo\",\"conselho\",\"gesellschaft\",\"rada\",\"совет\",\"raad\",\"συμβούλιο\"],\n" - + " \"key::23\": [\"library\",\"biblioteca\",\"bibliothèque\",\"biblioteca\",\"biblioteca\",\"Bibliothek\",\"biblioteka\",\"библиотека\",\"bibliotheek\",\"βιβλιοθήκη\"],\n" - + " \"key::24\": [\"ministry\",\"ministero\",\"ministère\",\"ministerio\",\"ministério\",\"Ministerium\",\"ministerstwo\",\"министерство\",\"ministerie\",\"υπουργείο\"],\n" - + " \"key::25\": [\"services\",\"servizi\",\"services\",\"servicios\",\"Serviços\",\"Dienstleistungen\",\"usługi\",\"услуги\",\"diensten\",\"υπηρεσίες\"],\n" - + " \"key::26\": [\"central\",\"centrale\",\"central\",\"centrale\",\"centrales\",\"central\",\"central\",\"zentral\",\"centralny\",\"цетральный\",\"centraal\",\"κεντρικός\",\"κεντρική\",\"κεντρικό\",\"κεντρικά\"],\n" - + " \"key::27\": [\"general\",\"generale\",\"général\",\"générale\",\"généraux\",\"générales\",\"general\",\"geral\",\"general\",\"Allgemeines\",\"general\",\"общий\",\"algemeen\",\"algemene\",\"γενικός\",\"γενική\",\"γενικό\",\"γενικά\"],\n" - + " \"key::28\": [\"applied\",\"applicati\",\"appliqué\",\"appliquée\",\"appliqués\",\"appliquées\",\"aplicado\",\"aplicada\",\"angewendet\",\"stosowany\",\"прикладной\",\"toegepast\",\"toegepaste\",\"εφαρμοσμένος\",\"εφαρμοσμένη\",\"εφαρμοσμένο\",\"εφαρμοσμένα\"],\n" - + " \"key::29\": [\"european\",\"europee\",\"europea\",\"européen\",\"européenne\",\"européens\",\"européennes\",\"europeo\",\"europeu\",\"europäisch\",\"europejski\",\"европейский\",\"Europees\",\"Europese\",\"ευρωπαϊκός\",\"ευρωπαϊκή\",\"ευρωπαϊκό\",\"ευρωπαϊκά\"],\n" - + " \"key::30\": [\"agency\",\"agenzia\",\"agence\",\"agencia\",\"agencia\",\"agentur\",\"agencja\",\"агенция\",\"agentschap\",\"πρακτορείο\"],\n" - + " \"key::31\": [\"laboratory\",\"laboratorio\",\"laboratoire\",\"laboratorio\",\"laboratorio\",\"labor\",\"laboratorium\",\"лаборатория\",\"laboratorium\",\"εργαστήριο\"],\n" - + " \"key::32\": [\"industry\",\"industria\",\"industrie\",\"индустрия\",\"industrie\",\"βιομηχανία\"],\n" - + " \"key::33\": [\"industrial\",\"industriale\",\"industriel\",\"industrielle\",\"industriels\",\"industrielles\",\"индустриальный\",\"industrieel\",\"βιομηχανικός\",\"βιομηχανική\",\"βιομηχανικό\",\"βιομηχανικά\",\"βιομηχανικές\"],\n" - + " \"key::34\": [\"consortium\",\"consorzio\",\"consortium\",\"консорциум\",\"consortium\",\"κοινοπραξία\"],\n" - + " \"key::35\": [\"organization\",\"organizzazione\",\"organisation\",\"organización\",\"organização\",\"organizacja\",\"организация\",\"organisatie\",\"οργανισμός\"],\n" - + " \"key::36\": [\"authority\",\"autorità\",\"autorité\",\"авторитет\",\"autoriteit\"],\n" - + " \"key::37\": [\"federation\",\"federazione\",\"fédération\",\"федерация\",\"federatie\",\"ομοσπονδία\"],\n" - + " \"key::38\": [\"observatory\",\"osservatorio\",\"observatoire\",\"обсерватория\",\"observatorium\",\"αστεροσκοπείο\"],\n" - + " \"key::39\": [\"bureau\",\"ufficio\",\"bureau\",\"офис\",\"bureau\",\"γραφείο\"],\n" - + " \"key::40\": [\"company\",\"impresa\",\"compagnie\",\"société\",\"компания\",\"bedrijf\",\"εταιρία\"],\n" - + " \"key::41\": [\"polytechnic\",\"politecnico\",\"polytechnique\",\"политехника\",\"polytechnisch\",\"πολυτεχνείο\",\"universita politecnica\",\"polytechnic university\",\"universidad politecnica\",\"universitat politecnica\",\"politechnika\",\"politechniki\",\"university technology\",\"university science technology\"],\n" - + " \"key::42\": [\"coalition\",\"coalizione\",\"coalition\",\"коалиция\",\"coalitie\",\"συνασπισμός\"],\n" - + " \"key::43\": [\"initiative\",\"iniziativa\",\"initiative\",\"инициатива\",\"initiatief\",\"πρωτοβουλία\"],\n" - + " \"key::44\": [\"academic\",\"accademico\",\"académique\",\"universitaire\",\"акадеческий academisch\",\"ακαδημαϊκός\",\"ακαδημαϊκή\",\"ακαδημαϊκό\",\"ακαδημαϊκές\",\"ακαδημαϊκοί\"],\n" - + " \"key::45\": [\"institution\",\"istituzione\",\"institution\",\"институциональный\",\"instelling\",\"ινστιτούτο\"],\n" - + " \"key::46\": [\"division\",\"divisione\",\"division\",\"отделение\",\"divisie\",\"τμήμα\"],\n" - + " \"key::47\": [\"committee\",\"comitato\",\"comité\",\"комитет\",\"commissie\",\"επιτροπή\"],\n" - + " \"key::48\": [\"promotion\",\"promozione\",\"продвижение\",\"proothisis\",\"forderung\"],\n" - + " \"key::49\": [\"medical\",\"medicine\",\"clinical\",\"medicina\",\"clinici\",\"médico\",\"medicina\",\"clínica\",\"médico\",\"medicina\",\"clínica\",\"medizinisch\",\"Medizin\",\"klinisch\",\"medisch\",\"geneeskunde\",\"klinisch\",\"ιατρικός\",\"ιατρική\",\"ιατρικό\",\"ιατρικά\",\"κλινικός\",\"κλινική\",\"κλινικό\",\"κλινικά\",\"tıbbi\",\"tıp\",\"klinik\",\"orvosi\",\"orvostudomány\",\"klinikai\",\"zdravniški\",\"medicinski\",\"klinični\",\"meditsiini\",\"kliinik\",\"kliiniline\"],\n" - + " \"key::50\": [\"technology\",\"technological\",\"tecnologia\",\"tecnologie\",\"tecnología\",\"tecnológico\",\"tecnologia\",\"tecnológico\",\"Technologie\",\"technologisch\",\"technologie\",\"technologisch\",\"τεχνολογία\",\"τεχνολογικός\",\"τεχνολογική\",\"τεχνολογικό\",\"teknoloji\",\"teknolojik\",\"technológia\",\"technológiai\",\"tehnologija\",\"tehnološki\",\"tehnoloogia\",\"tehnoloogiline\",\"technologii\",\"technical\",\"texniki\",\"teknik\"],\n" - + " \"key::51\": [\"science\",\"scientific\",\"scienza\",\"scientifiche\",\"scienze\",\"ciencia\",\"científico\",\"ciência\",\"científico\",\"Wissenschaft\",\"wissenschaftlich\",\"wetenschap\",\"wetenschappelijk\",\"επιστήμη\",\"επιστημονικός\",\"επιστημονική\",\"επιστημονικό\",\"επιστημονικά\",\"bilim\",\"bilimsel\",\"tudomány\",\"tudományos\",\"znanost\",\"znanstveni\",\"teadus\",\"teaduslik\",\"\"],\n" - + " \"key::52\": [\"engineering\",\"ingegneria\",\"ingeniería\",\"engenharia\",\"Ingenieurwissenschaft\",\"ingenieurswetenschappen\",\"bouwkunde\",\"μηχανικός\",\"μηχανική\",\"μηχανικό\",\"mühendislik\",\"mérnöki\",\"Inženirstvo\",\"inseneeria\",\"inseneri\",\"\"],\n" - + " \"key::53\": [\"management\",\"gestione\",\"gestionale\",\"gestionali\",\"gestión\",\"administración\",\"gestão\",\"administração\",\"Verwaltung\",\"management\",\"διαχείριση\",\"yönetim\",\"menedzsment\",\"vodstvo\",\"upravljanje\",\"management\",\"juhtkond\",\"juhtimine\",\"haldus\",\"\"],\n" - + " \"key::54\": [\"energy\",\"energia\",\"energía\",\"energia\",\"Energie\",\"energie\",\"ενέργεια\",\"enerji\",\"energia\",\"energija\",\"energia\",\"\"],\n" - + " \"key::55\": [\"agricultural\",\"agriculture\",\"agricoltura\",\"agricole\",\"agrícola\",\"agricultura\",\"agrícola\",\"agricultura\",\"landwirtschaftlich\",\"Landwirtschaft\",\"landbouwkundig\",\"landbouw\",\"αγροτικός\",\"αγροτική\",\"αγροτικό\",\"γεωργικός\",\"γεωργική\",\"γεωργικό\",\"γεωργία\",\"tarımsal\",\"tarım\",\"mezőgazdasági\",\"mezőgazdaság\",\"poljedelski\",\"poljedelstvo\",\"põllumajandus\",\"põllumajanduslik\",\"\"],\n" - + " \"key::56\": [\"information\",\"informazione\",\"información\",\"informação\",\"Information\",\"informatie\",\"πληροφορία\",\"bilgi\",\"információ\",\"informacija\",\"informatsioon\",\"informatycznych\",\"\"],\n" - + " \"key::57\": [\"social\",\"sociali\",\"social\",\"social\",\"Sozial\",\"sociaal\",\"maatschappelijk\",\"κοινωνικός\",\"κοινωνική\",\"κοινωνικό\",\"κοινωνικά\",\"sosyal\",\"szociális\",\"družbeni\",\"sotsiaal\",\"sotsiaalne\",\"\"],\n" - + " \"key::58\": [\"environmental\",\"ambiente\",\"medioambiental\",\"ambiente\",\"medioambiente\",\"meioambiente\",\"Umwelt\",\"milieu\",\"milieuwetenschap\",\"milieukunde\",\"περιβαλλοντικός\",\"περιβαλλοντική\",\"περιβαλλοντικό\",\"περιβαλλοντικά\",\"çevre\",\"környezeti\",\"okoliški\",\"keskonna\",\"\"],\n" - + " \"key::59\": [\"business\",\"economia\",\"economiche\",\"economica\",\"negocio\",\"empresa\",\"negócio\",\"Unternehmen\",\"bedrijf\",\"bedrijfskunde\",\"επιχείρηση\",\"iş\",\"üzleti\",\"posel\",\"ettevõte/äri\",\"\"],\n" - + " \"key::60\": [\"pharmaceuticals\",\"pharmacy\",\"farmacia\",\"farmaceutica\",\"farmacéutica\",\"farmacia\",\"farmacêutica\",\"farmácia\",\"Pharmazeutika\",\"Arzneimittelkunde\",\"farmaceutica\",\"geneesmiddelen\",\"apotheek\",\"φαρμακευτικός\",\"φαρμακευτική\",\"φαρμακευτικό\",\"φαρμακευτικά\",\"φαρμακείο\",\"ilaç\",\"eczane\",\"gyógyszerészeti\",\"gyógyszertár\",\"farmacevtika\",\"lekarništvo\",\"farmaatsia\",\"farmatseutiline\",\"\"],\n" - + " \"key::61\": [\"healthcare\",\"health services\",\"salute\",\"atenciónmédica\",\"cuidadodelasalud\",\"cuidadoscomasaúde\",\"Gesundheitswesen\",\"gezondheidszorg\",\"ιατροφαρμακευτικήπερίθαλψη\",\"sağlıkhizmeti\",\"egészségügy\",\"zdravstvo\",\"tervishoid\",\"tervishoiu\",\"\"],\n" - + " \"key::62\": [\"history\",\"storia\",\"historia\",\"história\",\"Geschichte\",\"geschiedenis\",\"geschiedkunde\",\"ιστορία\",\"tarih\",\"történelem\",\"zgodovina\",\"ajalugu\",\"\"],\n" - + " \"key::63\": [\"materials\",\"materiali\",\"materia\",\"materiales\",\"materiais\",\"materialen\",\"υλικά\",\"τεκμήρια\",\"malzemeler\",\"anyagok\",\"materiali\",\"materjalid\",\"vahendid\",\"\"],\n" - + " \"key::64\": [\"economics\",\"economia\",\"economiche\",\"economica\",\"economía\",\"economia\",\"Wirtschaft\",\"economie\",\"οικονομικά\",\"οικονομικέςεπιστήμες\",\"ekonomi\",\"közgazdaságtan\",\"gospodarstvo\",\"ekonomija\",\"majanduslik\",\"majandus\",\"\"],\n" - + " \"key::65\": [\"therapeutics\",\"terapeutica\",\"terapéutica\",\"terapêutica\",\"therapie\",\"θεραπευτική\",\"tedavibilimi\",\"gyógykezelés\",\"terapevtika\",\"terapeutiline\",\"ravi\",\"\"],\n" - + " \"key::66\": [\"oncology\",\"oncologia\",\"oncologico\",\"oncología\",\"oncologia\",\"Onkologie\",\"oncologie\",\"ογκολογία\",\"onkoloji\",\"onkológia\",\"onkologija\",\"onkoloogia\",\"\"],\n" - + " \"key::67\": [\"natural\",\"naturali\",\"naturale\",\"natural\",\"natural\",\"natürlich\",\"natuurlijk\",\"φυσικός\",\"φυσική\",\"φυσικό\",\"φυσικά\",\"doğal\",\"természetes\",\"naraven\",\"loodus\",\"\"],\n" - + " \"key::68\": [\"educational\",\"educazione\",\"pedagogia\",\"educacional\",\"educativo\",\"educacional\",\"pädagogisch\",\"educatief\",\"εκπαιδευτικός\",\"εκπαιδευτική\",\"εκπαιδευτικό\",\"εκπαιδευτικά\",\"eğitimsel\",\"oktatási\",\"izobraževalen\",\"haridus\",\"hariduslik\",\"\"],\n" - + " \"key::69\": [\"biomedical\",\"biomedica\",\"biomédico\",\"biomédico\",\"biomedizinisch\",\"biomedisch\",\"βιοιατρικός\",\"βιοιατρική\",\"βιοιατρικό\",\"βιοιατρικά\",\"biyomedikal\",\"orvosbiológiai\",\"biomedicinski\",\"biomeditsiiniline\",\"\"],\n" - + " \"key::70\": [\"veterinary\",\"veterinaria\",\"veterinarie\",\"veterinaria\",\"veterinária\",\"tierärtzlich\",\"veterinair\",\"veeartsenijlkunde\",\"κτηνιατρικός\",\"κτηνιατρική\",\"κτηνιατρικό\",\"κτηνιατρικά\",\"veteriner\",\"állatorvosi\",\"veterinar\",\"veterinarski\",\"veterinaaria\",\"\"],\n" - + " \"key::71\": [\"chemistry\",\"chimica\",\"química\",\"química\",\"Chemie\",\"chemie\",\"scheikunde\",\"χημεία\",\"kimya\",\"kémia\",\"kemija\",\"keemia\",\"\"],\n" - + " \"key::72\": [\"security\",\"sicurezza\",\"seguridad\",\"segurança\",\"Sicherheit\",\"veiligheid\",\"ασφάλεια\",\"güvenlik\",\"biztonsági\",\"varnost\",\"turvalisus\",\"julgeolek\",\"\"],\n" - + " \"key::73\": [\"biotechnology\",\"biotecnologia\",\"biotecnologie\",\"biotecnología\",\"biotecnologia\",\"Biotechnologie\",\"biotechnologie\",\"βιοτεχνολογία\",\"biyoteknoloji\",\"biotechnológia\",\"biotehnologija\",\"biotehnoloogia\",\"\"],\n" - + " \"key::74\": [\"military\",\"militare\",\"militari\",\"militar\",\"militar\",\"Militär\",\"militair\",\"leger\",\"στρατιωτικός\",\"στρατιωτική\",\"στρατιωτικό\",\"στρατιωτικά\",\"askeri\",\"katonai\",\"vojaški\",\"vojni\",\"militaar\",\"wojskowa\",\"\"],\n" - + " \"key::75\": [\"theological\",\"teologia\",\"teologico\",\"teológico\",\"tecnológica\",\"theologisch\",\"theologisch\",\"θεολογικός\",\"θεολογική\",\"θεολογικό\",\"θεολογικά\",\"teolojik\",\"technológiai\",\"teološki\",\"teoloogia\",\"usuteadus\",\"teoloogiline\",\"\"],\n" - + " \"key::76\": [\"electronics\",\"elettronica\",\"electrónica\",\"eletrônicos\",\"Elektronik\",\"elektronica\",\"ηλεκτρονική\",\"elektronik\",\"elektronika\",\"elektronika\",\"elektroonika\",\"\"],\n" - + " \"key::77\": [\"forestry\",\"forestale\",\"forestali\",\"silvicultura\",\"forestal\",\"floresta\",\"Forstwirtschaft\",\"bosbouw\",\"δασοκομία\",\"δασολογία\",\"ormancılık\",\"erdészet\",\"gozdarstvo\",\"metsandus\",\"\"],\n" - + " \"key::78\": [\"maritime\",\"marittima\",\"marittime\",\"marittimo\",\"marítimo\",\"marítimo\",\"maritiem\",\"ναυτικός\",\"ναυτική\",\"ναυτικό\",\"ναυτικά\",\"ναυτιλιακός\",\"ναυτιλιακή\",\"ναυτιλιακό\",\"ναυτιλιακά\",\"θαλάσσιος\",\"θαλάσσια\",\"θαλάσσιο\",\"denizcilik\",\"tengeri\",\"morski\",\"mere\",\"merendus\",\"\"],\n" - + " \"key::79\": [\"sports\",\"sport\",\"deportes\",\"esportes\",\"Sport\",\"sport\",\"sportwetenschappen\",\"άθληση\",\"γυμναστικήδραστηριότητα\",\"spor\",\"sport\",\"šport\",\"sport\",\"spordi\",\"\"],\n" - + " \"key::80\": [\"surgery\",\"chirurgia\",\"chirurgiche\",\"cirugía\",\"cirurgia\",\"Chirurgie\",\"chirurgie\",\"heelkunde\",\"εγχείρηση\",\"επέμβαση\",\"χειρουργικήεπέμβαση\",\"cerrahi\",\"sebészet\",\"kirurgija\",\"kirurgia\",\"\"],\n" - + " \"key::81\": [\"cultural\",\"culturale\",\"culturali\",\"cultura\",\"cultural\",\"cultural\",\"kulturell\",\"cultureel\",\"πολιτιστικός\",\"πολιτιστική\",\"πολιτιστικό\",\"πολιτισμικός\",\"πολιτισμική\",\"πολιτισμικό\",\"kültürel\",\"kultúrális\",\"kulturni\",\"kultuuri\",\"kultuuriline\",\"\"],\n" - + " \"key::82\": [\"computerscience\",\"informatica\",\"ordenador\",\"computadora\",\"informática\",\"computación\",\"cienciasdelacomputación\",\"ciênciadacomputação\",\"Computer\",\"computer\",\"υπολογιστής\",\"ηλεκτρονικόςυπολογιστής\",\"bilgisayar\",\"számítógép\",\"računalnik\",\"arvuti\",\"\"],\n" - + " \"key::83\": [\"finance\",\"financial\",\"finanza\",\"finanziarie\",\"finanza\",\"financiero\",\"finanças\",\"financeiro\",\"Finanzen\",\"finanziell\",\"financiën\",\"financieel\",\"χρηματοοικονομικά\",\"χρηματοδότηση\",\"finanse\",\"finansal\",\"pénzügy\",\"pénzügyi\",\"finance\",\"finančni\",\"finants\",\"finantsiline\",\"\"],\n" - + " \"key::84\": [\"communication\",\"comunicazione\",\"comuniciación\",\"comunicação\",\"Kommunikation\",\"communication\",\"επικοινωνία\",\"iletişim\",\"kommunikáció\",\"komuniciranje\",\"kommunikatsioon\",\"\"],\n" - + " \"key::85\": [\"justice\",\"giustizia\",\"justicia\",\"justiça\",\"Recht\",\"Justiz\",\"justitie\",\"gerechtigheid\",\"δικαιοσύνη\",\"υπουργείοδικαιοσύνης\",\"δίκαιο\",\"adalet\",\"igazságügy\",\"pravo\",\"õigus\",\"\"],\n" - + " \"key::86\": [\"aerospace\",\"aerospaziale\",\"aerospaziali\",\"aeroespacio\",\"aeroespaço\",\"Luftfahrt\",\"luchtvaart\",\"ruimtevaart\",\"αεροπορικός\",\"αεροπορική\",\"αεροπορικό\",\"αεροναυπηγικός\",\"αεροναυπηγική\",\"αεροναυπηγικό\",\"αεροναυπηγικά\",\"havacılıkveuzay\",\"légtér\",\"zrakoplovstvo\",\"atmosfäär\",\"kosmos\",\"\"],\n" - + " \"key::87\": [\"dermatology\",\"dermatologia\",\"dermatología\",\"dermatologia\",\"Dermatologie\",\"dermatologie\",\"δρματολογία\",\"dermatoloji\",\"bőrgyógyászat\",\"dermatológia\",\"dermatologija\",\"dermatoloogia\",\"\"],\n" - + " \"key::88\": [\"architecture\",\"architettura\",\"arquitectura\",\"arquitetura\",\"Architektur\",\"architectuur\",\"αρχιτεκτονική\",\"mimarlık\",\"építészet\",\"arhitektura\",\"arhitektuur\",\"\"],\n" - + " \"key::89\": [\"mathematics\",\"matematica\",\"matematiche\",\"matemáticas\",\"matemáticas\",\"Mathematik\",\"wiskunde\",\"mathematica\",\"μαθηματικά\",\"matematik\",\"matematika\",\"matematika\",\"matemaatika\",\"\"],\n" - + " \"key::90\": [\"language\",\"lingue\",\"linguistica\",\"linguistiche\",\"lenguaje\",\"idioma\",\"língua\",\"idioma\",\"Sprache\",\"taal\",\"taalkunde\",\"γλώσσα\",\"dil\",\"nyelv\",\"jezik\",\"keel\",\"\"],\n" - + " \"key::91\": [\"neuroscience\",\"neuroscienza\",\"neurociencia\",\"neurociência\",\"Neurowissenschaft\",\"neurowetenschappen\",\"νευροεπιστήμη\",\"nörobilim\",\"idegtudomány\",\"nevroznanost\",\"neuroteadused\",\"\"],\n" - + " \"key::92\": [\"automation\",\"automazione\",\"automatización\",\"automação\",\"Automatisierung\",\"automatisering\",\"αυτοματοποίηση\",\"otomasyon\",\"automatizálás\",\"avtomatizacija\",\"automatiseeritud\",\"\"],\n" - + " \"key::93\": [\"pediatric\",\"pediatria\",\"pediatriche\",\"pediatrico\",\"pediátrico\",\"pediatría\",\"pediátrico\",\"pediatria\",\"pädiatrisch\",\"pediatrische\",\"παιδιατρική\",\"pediatrik\",\"gyermekgyógyászat\",\"pediatrija\",\"pediaatria\",\"\"],\n" - + " \"key::94\": [\"photonics\",\"fotonica\",\"fotoniche\",\"fotónica\",\"fotônica\",\"Photonik\",\"fotonica\",\"φωτονική\",\"fotonik\",\"fotonika\",\"fotonika\",\"fotoonika\",\"\"],\n" - + " \"key::95\": [\"mechanics\", \"mechanical\", \"meccanica\",\"meccaniche\",\"mecánica\",\"mecânica\",\"Mechanik\",\"Maschinenbau\",\"mechanica\",\"werktuigkunde\",\"μηχανικής\",\"mekanik\",\"gépészet\",\"mehanika\",\"mehaanika\",\"\"],\n" - + " \"key::96\": [\"psychiatrics\",\"psichiatria\",\"psichiatrica\",\"psichiatriche\",\"psiquiatría\",\"psiquiatria\",\"Psychiatrie\",\"psychiatrie\",\"ψυχιατρική\",\"psikiyatrik\",\"pszihiátria\",\"psihiatrija\",\"psühhaatria\",\"\"],\n" - + " \"key::97\": [\"psychology\",\"fisiologia\",\"psicología\",\"psicologia\",\"Psychologie\",\"psychologie\",\"ψυχολογία\",\"psikoloji\",\"pszihológia\",\"psihologija\",\"psühholoogia\",\"\"],\n" - + " \"key::98\": [\"automotive\",\"industriaautomobilistica\",\"industriadelautomóvil\",\"automotriz\",\"industriaautomotriz\",\"automotivo\",\"Automobilindustrie\",\"autoindustrie\",\"αυτοκίνητος\",\"αυτοκίνητη\",\"αυτοκίνητο\",\"αυτοκινούμενος\",\"αυτοκινούμενη\",\"αυτοκινούμενο\",\"αυτοκινητιστικός\",\"αυτοκινητιστική\",\"αυτοκινητιστικό\",\"otomotiv\",\"autóipari\",\"samogiben\",\"avtomobilskaindustrija\",\"auto-\",\"\"],\n" - + " \"key::99\": [\"neurology\",\"neurologia\",\"neurologiche\",\"neurología\",\"neurologia\",\"Neurologie\",\"neurologie\",\"zenuwleer\",\"νευρολογία\",\"nöroloji\",\"neurológia\",\"ideggyógyászat\",\"nevrologija\",\"neuroloogia\",\"\"],\n" - + " \"key::100\": [\"geology\",\"geologia\",\"geologiche\",\"geología\",\"geologia\",\"Geologie\",\"geologie\",\"aardkunde\",\"γεωλογία\",\"jeoloji\",\"geológia\",\"földtudomány\",\"geologija\",\"geoloogia\",\"\"],\n" - + " \"key::101\": [\"microbiology\",\"microbiologia\",\"micro-biologia\",\"microbiologiche\",\"microbiología\",\"microbiologia\",\"Mikrobiologie\",\"microbiologie\",\"μικροβιολογία\",\"mikrobiyoloji\",\"mikrobiológia\",\"mikrobiologija\",\"mikrobioloogia\",\"\"],\n" - + " \"key::102\": [\"informatics\",\"informatica\",\"informática\",\"informática\",\"informatica\",\"\"],\n" - + " \"key::103\": [\"forschungsgemeinschaft\",\"comunita ricerca\",\"research community\",\"research foundation\",\"research association\"],\n" - + " \"key::104\": [\"commerce\",\"ticaret\",\"ticarət\",\"commercio\",\"trade\",\"handel\",\"comercio\"],\n" - + " \"key::105\" : [\"state\", \"stato\", \"etade\", \"estado\", \"statale\", \"etat\", \"zustand\", \"estado\"],\n" - + " \"key::106\" : [\"seminary\", \"seminario\", \"seminaire\", \"seminar\"],\n" - + " \"key::107\" : [\"agricultural forestry\", \"af\", \"a f\"],\n" - + " \"key::108\" : [\"agricultural mechanical\", \"am\", \"a m\"],\n" - + " \"key::109\" : [\"catholic\", \"catholique\", \"katholische\", \"catolica\", \"cattolica\", \"catolico\"]\n" - + " }\n" - + " }\n" - + "}"); - @Test - void testJPath() { + void testJPath() throws IOException { - MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(conf, json); + DedupConfig conf = DedupConfig + .load(IOUtils.toString(getClass().getResourceAsStream("dedup_conf_organization.json"))); - Assertions.assertNotNull(d); - Assertions.assertTrue(StringUtils.isNotBlank(d.getIdentifier())); + final String org = IOUtils.toString(getClass().getResourceAsStream("organization.json")); - System.out.println("d = " + d); + Row row = SparkModel.apply(conf).rowFromJson(org); + + Assertions.assertNotNull(row); + Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier"))); } - @Test - void testNull() { - final Object p = null; - - System.out.println((String) p); - - } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/oa/dedup/jpath/dedup_conf_organization.json b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/oa/dedup/jpath/dedup_conf_organization.json new file mode 100644 index 000000000..726f2b899 --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/oa/dedup/jpath/dedup_conf_organization.json @@ -0,0 +1,269 @@ +{ + "wf" : { + "threshold" : "0.99", + "dedupRun" : "001", + "entityType" : "organization", + "subEntityValue": "organization", + "orderField" : "legalname", + "queueMaxSize" : "2000", + "groupMaxSize" : "50", + "slidingWindowSize" : "200", + "idPath":"$.id", + "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], + "includeChildren" : "true", + "maxIterations": "20" + }, + "pace" : { + "clustering" : [ + { "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, + { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }, + { "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} } + ], + "decisionTree" : { + "start": { + "fields": [ + { + "field": "gridid", + "comparator": "exactMatch", + "weight": 1, + "countIfUndefined": "false", + "params": {} + } + ], + "threshold": 1, + "aggregation": "AVG", + "positive": "MATCH", + "negative": "NO_MATCH", + "undefined": "layer2", + "ignoreUndefined": "false" + }, + "layer2": { + "fields": [ + { + "field": "websiteurl", + "comparator": "domainExactMatch", + "weight": 1, + "countIfUndefined": "false", + "params": {} + }, + { + "field": "country", + "comparator": "exactMatch", + "weight": 1, + "countIfUndefined": "true", + "params": {} + }, + { + "field": "legalname", + "comparator": "numbersMatch", + "weight": 1, + "countIfUndefined": "true", + "params": {} + }, + { + "field": "legalname", + "comparator": "romansMatch", + "weight": 1, + "countIfUndefined": "true", + "params": {} + } + ], + "threshold": 1, + "aggregation": "AND", + "positive": "layer3", + "negative": "NO_MATCH", + "undefined": "layer3", + "ignoreUndefined": "true" + }, + "layer3": { + "fields": [ + { + "field": "legalname", + "comparator": "cityMatch", + "weight": 1.0, + "countIfUndefined": "true", + "params": { + "windowSize": "4" + } + } + ], + "threshold": 0.1, + "aggregation": "AVG", + "positive": "layer4", + "negative": "NO_MATCH", + "undefined": "NO_MATCH", + "ignoreUndefined": "true" + }, + "layer4": { + "fields": [ + { + "field": "legalname", + "comparator": "keywordMatch", + "weight": 1.0, + "countIfUndefined": "true", + "params": { + "windowSize": "4" + } + } + ], + "threshold": 0.7, + "aggregation": "AVG", + "positive": "layer5", + "negative": "NO_MATCH", + "undefined": "layer5", + "ignoreUndefined": "true" + }, + "layer5": { + "fields": [ + { + "field": "legalname", + "comparator": "jaroWinklerNormalizedName", + "weight": 0.9, + "countIfUndefined": "true", + "params": { + "windowSize": "4" + } + }, + { + "field": "legalshortname", + "comparator": "jaroWinklerNormalizedName", + "weight": 0.1, + "countIfUndefined": "false", + "params": { + "windowSize": 4 + } + } + ], + "threshold": 0.9, + "aggregation": "W_MEAN", + "positive": "MATCH", + "negative": "NO_MATCH", + "undefined": "NO_MATCH", + "ignoreUndefined": "true" + } + }, + "model" : [ + { "name" : "country", "type" : "String", "path" : "$.country.classid"}, + { "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"}, + { "name" : "legalname", "type" : "String", "path" : "$.legalname.value" }, + { "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" }, + { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid')].value"}, + { "name" : "originalId", "type" : "String", "path" : "$.id" } + ], + "blacklists" : { + "legalname" : [] + }, + "synonyms": { + "key::1": ["university","università", "universitas", "università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"], + "key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"], + "key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"], + "key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"], + "key::5": ["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"], + "key::6": ["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"], + "key::7": ["college","collegio","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","κολλέγιο"], + "key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"], + "key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"], + "key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"], + "key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"], + "key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"], + "key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"], + "key::14": ["community","comunita","communauté","comunidad","comunidade","Gemeinschaft","społeczność","сообщество","gemeenschap","κοινότητα"], + "key::15": ["school","scuola","école","escuela","escola","schule","Szkoła","школа","school","σχολείο"], + "key::16": ["education","educazione","éducation","educacion","Educação","Bildung","Edukacja","образование","opleiding","εκπαίδευση"], + "key::17": ["academy","accademia","académie","academia","academia","Akademie","akademie","академия","academie","ακαδημία"], + "key::18": ["public","pubblico","public","publique","publics","publiques","publico","publico","Öffentlichkeit","publiczny","публичный","publiek","publieke","δημόσιος","δημόσια","δημόσιο"], + "key::19": ["museum","museo","musée","mueso","museu","museum","muzeum","музей","museum","μουσείο"], + "key::20": ["group","gruppo","groupe","grupo","grupo","gruppe","grupa","группа","groep","ομάδα","όμιλος"], + "key::21": ["department","dipartimento","département","departamento","departamento","abteilung","departament","отдел","afdeling","τμήμα"], + "key::22": ["council","consiglio","conseil","Consejo","conselho","gesellschaft","rada","совет","raad","συμβούλιο"], + "key::23": ["library","biblioteca","bibliothèque","biblioteca","biblioteca","Bibliothek","biblioteka","библиотека","bibliotheek","βιβλιοθήκη"], + "key::24": ["ministry","ministero","ministère","ministerio","ministério","Ministerium","ministerstwo","министерство","ministerie","υπουργείο"], + "key::25": ["services","servizi","services","servicios","Serviços","Dienstleistungen","usługi","услуги","diensten","υπηρεσίες"], + "key::26": ["central","centrale","central","centrale","centrales","central","central","zentral","centralny","цетральный","centraal","κεντρικός","κεντρική","κεντρικό","κεντρικά"], + "key::27": ["general","generale","général","générale","généraux","générales","general","geral","general","Allgemeines","general","общий","algemeen","algemene","γενικός","γενική","γενικό","γενικά"], + "key::28": ["applied","applicati","appliqué","appliquée","appliqués","appliquées","aplicado","aplicada","angewendet","stosowany","прикладной","toegepast","toegepaste","εφαρμοσμένος","εφαρμοσμένη","εφαρμοσμένο","εφαρμοσμένα"], + "key::29": ["european","europee","europea","européen","européenne","européens","européennes","europeo","europeu","europäisch","europejski","европейский","Europees","Europese","ευρωπαϊκός","ευρωπαϊκή","ευρωπαϊκό","ευρωπαϊκά"], + "key::30": ["agency","agenzia","agence","agencia","agencia","agentur","agencja","агенция","agentschap","πρακτορείο"], + "key::31": ["laboratory","laboratorio","laboratoire","laboratorio","laboratorio","labor","laboratorium","лаборатория","laboratorium","εργαστήριο"], + "key::32": ["industry","industria","industrie","индустрия","industrie","βιομηχανία"], + "key::33": ["industrial","industriale","industriel","industrielle","industriels","industrielles","индустриальный","industrieel","βιομηχανικός","βιομηχανική","βιομηχανικό","βιομηχανικά","βιομηχανικές"], + "key::34": ["consortium","consorzio","consortium","консорциум","consortium","κοινοπραξία"], + "key::35": ["organization","organizzazione","organisation","organización","organização","organizacja","организация","organisatie","οργανισμός"], + "key::36": ["authority","autorità","autorité","авторитет","autoriteit"], + "key::37": ["federation","federazione","fédération","федерация","federatie","ομοσπονδία"], + "key::38": ["observatory","osservatorio","observatoire","обсерватория","observatorium","αστεροσκοπείο"], + "key::39": ["bureau","ufficio","bureau","офис","bureau","γραφείο"], + "key::40": ["company","impresa","compagnie","société","компания","bedrijf","εταιρία"], + "key::41": ["polytechnic","politecnico","polytechnique","политехника","polytechnisch","πολυτεχνείο","universita politecnica","polytechnic university","universidad politecnica","universitat politecnica","politechnika","politechniki","university technology","university science technology"], + "key::42": ["coalition","coalizione","coalition","коалиция","coalitie","συνασπισμός"], + "key::43": ["initiative","iniziativa","initiative","инициатива","initiatief","πρωτοβουλία"], + "key::44": ["academic","accademico","académique","universitaire","акадеческий academisch","ακαδημαϊκός","ακαδημαϊκή","ακαδημαϊκό","ακαδημαϊκές","ακαδημαϊκοί"], + "key::45": ["institution","istituzione","institution","институциональный","instelling","ινστιτούτο"], + "key::46": ["division","divisione","division","отделение","divisie","τμήμα"], + "key::47": ["committee","comitato","comité","комитет","commissie","επιτροπή"], + "key::48": ["promotion","promozione","продвижение","proothisis","forderung"], + "key::49": ["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"], + "key::50": ["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"], + "key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik",""], + "key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri",""], + "key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus",""], + "key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia",""], + "key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik",""], + "key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych",""], + "key::57": ["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne",""], + "key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna",""], + "key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri",""], + "key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline",""], + "key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu",""], + "key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu",""], + "key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid",""], + "key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus",""], + "key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi",""], + "key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia",""], + "key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus",""], + "key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik",""], + "key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline",""], + "key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärtzlich","veterinair","veeartsenijlkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria",""], + "key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia",""], + "key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek",""], + "key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia",""], + "key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa",""], + "key::75": ["theological","teologia","teologico","teológico","tecnológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","technológiai","teološki","teoloogia","usuteadus","teoloogiline",""], + "key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika",""], + "key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus",""], + "key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus",""], + "key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi",""], + "key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia",""], + "key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline",""], + "key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti",""], + "key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline",""], + "key::84": ["communication","comunicazione","comuniciación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon",""], + "key::85": ["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus",""], + "key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos",""], + "key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δρματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia",""], + "key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur",""], + "key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika",""], + "key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel",""], + "key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused",""], + "key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""], + "key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""], + "key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""], + "key::95": ["mechanics", "mechanical", "meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""], + "key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria",""], + "key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia",""], + "key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""], + "key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia",""], + "key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia",""], + "key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia",""], + "key::102": ["informatics","informatica","informática","informática","informatica",""], + "key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"], + "key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"], + "key::105" : ["state", "stato", "etade", "estado", "statale", "etat", "zustand", "estado"], + "key::106" : ["seminary", "seminario", "seminaire", "seminar"], + "key::107" : ["agricultural forestry", "af", "a f"], + "key::108" : ["agricultural mechanical", "am", "a m"], + "key::109" : ["catholic", "catholique", "katholische", "catolica", "cattolica", "catolico"] + } + } +} \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/oa/dedup/jpath/organization.json b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/oa/dedup/jpath/organization.json new file mode 100644 index 000000000..f3777ec0c --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/oa/dedup/jpath/organization.json @@ -0,0 +1,241 @@ +{ + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.810000002384185791", + "inferenceprovenance": "", + "provenanceaction": { + "classid": "sysimport:crosswalk:entityregistry", + "classname": "sysimport:crosswalk:entityregistry", + "schemeid": "dnet:provenance_actions", + "schemename": "dnet:provenance_actions" + } + }, + "lastupdatetimestamp": 1584960968152, + "id": "20|corda__h2020::9faf23721249f26ac2c16eb857ea1fb9", + "originalId": ["corda__h2020::927957582"], + "collectedfrom": [ + { + "key": "openaire____::corda_h2020", + "value": "CORDA - COmmon Research DAta Warehouse - Horizon 2020", + "dataInfo": null + } + ], + "pid": [], + "dateofcollection": "2016-06-05", + "dateoftransformation": "2019-11-19", + "extraInfo": [], + "oaiprovenance": null, + "legalshortname": { + "value": "Comentor AB", + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.810000002384185791", + "inferenceprovenance": "", + "provenanceaction": { + "classid": "sysimport:crosswalk:entityregistry", + "classname": "sysimport:crosswalk:entityregistry", + "schemeid": "dnet:provenance_actions", + "schemename": "dnet:provenance_actions" + } + } + }, + "legalname": { + "value": "Comentor AB", + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.810000002384185791", + "inferenceprovenance": "", + "provenanceaction": { + "classid": "sysimport:crosswalk:entityregistry", + "classname": "sysimport:crosswalk:entityregistry", + "schemeid": "dnet:provenance_actions", + "schemename": "dnet:provenance_actions" + } + } + }, + "alternativeNames": [], + "websiteurl": { + "value": "http://www.comentor.se", + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.810000002384185791", + "inferenceprovenance": "", + "provenanceaction": { + "classid": "sysimport:crosswalk:entityregistry", + "classname": "sysimport:crosswalk:entityregistry", + "schemeid": "dnet:provenance_actions", + "schemename": "dnet:provenance_actions" + } + } + }, + "logourl": null, + "eclegalbody": { + "value": "false", + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.810000002384185791", + "inferenceprovenance": "", + "provenanceaction": { + "classid": "sysimport:crosswalk:entityregistry", + "classname": "sysimport:crosswalk:entityregistry", + "schemeid": "dnet:provenance_actions", + "schemename": "dnet:provenance_actions" + } + } + }, + "eclegalperson": { + "value": "true", + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.810000002384185791", + "inferenceprovenance": "", + "provenanceaction": { + "classid": "sysimport:crosswalk:entityregistry", + "classname": "sysimport:crosswalk:entityregistry", + "schemeid": "dnet:provenance_actions", + "schemename": "dnet:provenance_actions" + } + } + }, + "ecnonprofit": { + "value": "false", + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.810000002384185791", + "inferenceprovenance": "", + "provenanceaction": { + "classid": "sysimport:crosswalk:entityregistry", + "classname": "sysimport:crosswalk:entityregistry", + "schemeid": "dnet:provenance_actions", + "schemename": "dnet:provenance_actions" + } + } + }, + "ecresearchorganization": { + "value": "false", + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.810000002384185791", + "inferenceprovenance": "", + "provenanceaction": { + "classid": "sysimport:crosswalk:entityregistry", + "classname": "sysimport:crosswalk:entityregistry", + "schemeid": "dnet:provenance_actions", + "schemename": "dnet:provenance_actions" + } + } + }, + "echighereducation": { + "value": "false", + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.810000002384185791", + "inferenceprovenance": "", + "provenanceaction": { + "classid": "sysimport:crosswalk:entityregistry", + "classname": "sysimport:crosswalk:entityregistry", + "schemeid": "dnet:provenance_actions", + "schemename": "dnet:provenance_actions" + } + } + }, + "ecinternationalorganizationeurinterests": { + "value": "false", + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.810000002384185791", + "inferenceprovenance": "", + "provenanceaction": { + "classid": "sysimport:crosswalk:entityregistry", + "classname": "sysimport:crosswalk:entityregistry", + "schemeid": "dnet:provenance_actions", + "schemename": "dnet:provenance_actions" + } + } + }, + "ecinternationalorganization": { + "value": "false", + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.810000002384185791", + "inferenceprovenance": "", + "provenanceaction": { + "classid": "sysimport:crosswalk:entityregistry", + "classname": "sysimport:crosswalk:entityregistry", + "schemeid": "dnet:provenance_actions", + "schemename": "dnet:provenance_actions" + } + } + }, + "ecenterprise": { + "value": "false", + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.810000002384185791", + "inferenceprovenance": "", + "provenanceaction": { + "classid": "sysimport:crosswalk:entityregistry", + "classname": "sysimport:crosswalk:entityregistry", + "schemeid": "dnet:provenance_actions", + "schemename": "dnet:provenance_actions" + } + } + }, + "ecsmevalidated": { + "value": "true", + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.810000002384185791", + "inferenceprovenance": "", + "provenanceaction": { + "classid": "sysimport:crosswalk:entityregistry", + "classname": "sysimport:crosswalk:entityregistry", + "schemeid": "dnet:provenance_actions", + "schemename": "dnet:provenance_actions" + } + } + }, + "ecnutscode": { + "value": "false", + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.810000002384185791", + "inferenceprovenance": "", + "provenanceaction": { + "classid": "sysimport:crosswalk:entityregistry", + "classname": "sysimport:crosswalk:entityregistry", + "schemeid": "dnet:provenance_actions", + "schemename": "dnet:provenance_actions" + } + } + }, + "country": null +} diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/OrcidPropagationJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/OrcidPropagationJobTest.java index 85db7ecf9..40849132e 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/OrcidPropagationJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/OrcidPropagationJobTest.java @@ -19,10 +19,8 @@ import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.cloudera.org.codehaus.jackson.map.jsontype.impl.ClassNameIdResolver; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.PropagationConstant; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Dataset; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala index 9d57e5869..704c9ab5c 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala @@ -130,7 +130,7 @@ object SparkCreateInputGraph { val ds: Dataset[T] = spark.read.load(sourcePath).as[T] ds.groupByKey(_.getId) - .reduceGroups { (x, y) => + .reduceGroups { (x: T, y: T) => x.mergeFrom(y) x } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java index 1560fcbd9..cd401c6cb 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java @@ -156,6 +156,11 @@ public class XmlIndexingJob { switch (outputFormat) { case SOLR: final String collection = ProvisionConstants.getCollectionName(format); + + // SparkSolr >= 4 + // com.lucidworks.spark.BatchSizeType bt = com.lucidworks.spark.BatchSizeType.NUM_DOCS; + // SolrSupport.indexDocs(zkHost, collection, batchSize, bt, docs.rdd()); + // SparkSolr < 4 SolrSupport.indexDocs(zkHost, collection, batchSize, docs.rdd()); break; case HDFS: diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java index e88b49de4..19d889d90 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java @@ -31,7 +31,6 @@ import org.dom4j.Node; import org.dom4j.io.OutputFormat; import org.dom4j.io.SAXReader; import org.dom4j.io.XMLWriter; -import org.json4s.Xml; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.base.Joiner;