diff --git a/dnet-dedup-test/pom.xml.releaseBackup b/dnet-dedup-test/pom.xml.releaseBackup
new file mode 100644
index 0000000..1e017bd
--- /dev/null
+++ b/dnet-dedup-test/pom.xml.releaseBackup
@@ -0,0 +1,149 @@
+
+
+
+ 4.0.0
+
+
+ eu.dnetlib
+ dnet-dedup
+ 4.0.2-SNAPSHOT
+ ../pom.xml
+
+
+ dnet-dedup-test
+ jar
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-deploy-plugin
+ 2.7
+
+ true
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+
+
+ 1.8
+
+ **/*.java
+
+
+
+
+
+
+
+
+
+ net.alchim31.maven
+ scala-maven-plugin
+ 4.0.1
+
+
+
+
+
+
+
+
+
+
+ scala-compile-first
+ initialize
+
+ add-source
+ compile
+
+
+
+ scala-test-compile
+ process-test-resources
+
+ testCompile
+
+
+
+
+ ${scala.version}
+
+
+
+
+
+
+
+
+
+
+ eu.dnetlib
+ dnet-pace-core
+ ${project.version}
+
+
+
+
+ org.apache.spark
+ spark-core_2.11
+
+
+ org.apache.spark
+ spark-graphx_2.11
+
+
+
+ org.apache.spark
+ spark-sql_2.11
+
+
+
+ junit
+ junit
+ test
+
+
+
+ com.fasterxml.jackson.core
+ jackson-databind
+
+
+
+ org.scala-lang
+ scala-library
+
+
+
+
+
\ No newline at end of file
diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/ClusteringTester.java b/dnet-dedup-test/src/main/java/eu/dnetlib/ClusteringTester.java
new file mode 100644
index 0000000..b701363
--- /dev/null
+++ b/dnet-dedup-test/src/main/java/eu/dnetlib/ClusteringTester.java
@@ -0,0 +1,115 @@
+package eu.dnetlib;
+
+import eu.dnetlib.pace.config.DedupConfig;
+import eu.dnetlib.pace.model.MapDocument;
+import eu.dnetlib.pace.util.MapDocumentUtil;
+import eu.dnetlib.pace.utils.Utility;
+import eu.dnetlib.support.Block;
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaDoubleRDD;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.DoubleFunction;
+import org.apache.spark.api.java.function.PairFunction;
+import org.apache.spark.sql.SparkSession;
+import scala.Tuple2;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+
+public class ClusteringTester {
+
+    /**
+     * Command line entry point: runs the clustering statistics on the given input.
+     *
+     * @param args {@code args[0]} is the path of the dedup configuration file,
+     *             {@code args[1]} the path of the entities to process; any further
+     *             argument is ignored
+     * @throws IllegalArgumentException if fewer than two arguments are given
+     * @throws Exception if the run itself fails
+     */
+    public static void main(String[] args) throws Exception {
+        // fail with a usage hint instead of a bare ArrayIndexOutOfBoundsException
+        if (args == null || args.length < 2) {
+            throw new IllegalArgumentException(
+                    "Usage: ClusteringTester <configPath> <entitiesPath>");
+        }
+
+        String configPath = args[0];
+        String entitiesPath = args[1];
+
+        new ClusteringTester()
+                .run(configPath, entitiesPath);
+    }
+
+ /**
+  * Loads the dedup configuration and the entities, builds the deduplication blocks and
+  * prints block statistics to standard output: a histogram of block sizes, the maximum
+  * block size, the number of records, the number of blocks and the total number of
+  * comparisons.
+  *
+  * NOTE(review): the SparkSession obtained here is not stopped inside this method.
+  * NOTE(review): generic type arguments seem to be missing from several declarations
+  * below (e.g. "JavaPairRDD mapDocuments", "JavaRDD>> blockStats") - confirm against
+  * the original file.
+  *
+  * @param configPath   path of the JSON file holding the dedup configuration
+  * @param entitiesPath path of the text input holding the entities
+  * @throws IOException if the configuration file cannot be read
+  */
+ public void run(String configPath, String entitiesPath) throws IOException {
+
+ // parse the dedup configuration from the text read at configPath
+ DedupConfig dedupConf = DedupConfig.load(readJson(configPath));
+
+ // get (or create) a Spark session using the "local[*]" master
+ SparkSession spark = SparkSession
+ .builder()
+ .appName("ClusteringTester")
+ .master("local[*]")
+ .getOrCreate();
+
+ JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
+
+ // read entitiesPath as text and map every element to a (document identifier, MapDocument) pair
+ JavaPairRDD mapDocuments = sc
+ .textFile(entitiesPath)
+ .mapToPair(
+ (PairFunction) s -> {
+ MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s);
+ return new Tuple2<>(d.getIdentifier(), d);
+ });
+
+ long totalRecords = mapDocuments.count();
+
+ // create blocks for deduplication
+ JavaPairRDD blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf);
+
+ // one entry per block: (block key, (number of elements, comparisons computed by comparisonsNumber))
+ JavaRDD>> blockStats = blocks.map(b -> new Tuple2<>(b._1(), new Tuple2<>((b._2().elements()), comparisonsNumber(b._2(), dedupConf))));
+
+ // sum of the per-block comparisons
+ Long totalComparisons = blockStats.map(b -> b._2()._2()).reduce((a, b) -> a + b);
+
+ Long blocksNumber = blockStats.count();
+
+ // block sizes as doubles (converted through their string form), as needed for max() and histogram()
+ JavaDoubleRDD blockSizeRDD = blockStats.mapToDouble(b -> Double.parseDouble(b._2()._1().toString()));
+
+ Double maxBlockSize = blockSizeRDD.max();
+
+ // bucket boundaries 0, 10, 20, ...: (int) (maxBlockSize/10 + 3) of them
+ double[] buckets = new double[(int) (maxBlockSize/10 + 3)];
+
+ double bucketSize = 10.0;
+
+ double bucketBase = 0.0;
+ for (int i=0; i < buckets.length; i++) {
+ buckets[i] = bucketBase;
+ bucketBase += bucketSize;
+ }
+
+ long[] histogram = blockSizeRDD.histogram(buckets);
+
+ // print each count next to buckets[i] (presumably the lower bound of bucket i - confirm against the histogram() documentation)
+ System.out.println("b | n");
+ for (int i=0; i< histogram.length; i++) {
+ System.out.println(buckets[i] + " | " + histogram[i]);
+ }
+
+ // summary figures
+ System.out.println("max block size = " + maxBlockSize);
+ System.out.println("number of records = " + totalRecords);
+ System.out.println("number of blocks = " + blocksNumber);
+ System.out.println("total number of comparisons = " + totalComparisons);
+ }
+
+    /**
+     * Computes the number of comparisons for a block, considering the sliding window.
+     * The value is (number of window positions) x (pairs inside one window).
+     *
+     * @param b           the block whose number of elements is used
+     * @param dedupConfig configuration providing the sliding window size
+     * @return the number of comparisons computed for the block (0 for an empty or
+     *         single-element block)
+     */
+    public static Long comparisonsNumber(Block b, DedupConfig dedupConfig){
+        long blockSize = b.elements();
+        long slidingWindowSize = dedupConfig.getWf().getSlidingWindowSize();
+        // A window cannot hold more elements than the block has: when the configured
+        // window is at least as large as the block there is a single window containing
+        // the whole block, so the pairs must be counted on the block size.
+        long effectiveWindowSize = Math.min(slidingWindowSize, blockSize);
+        long pairsPerWindow = (effectiveWindowSize * (effectiveWindowSize - 1)) / 2;
+        long windowPositions = blockSize - effectiveWindowSize + 1;
+        return windowPositions * pairsPerWindow;
+    }
+
+    /**
+     * Reads the whole content of a text file into a string.
+     *
+     * @param fileName path of the file to read (opened with a {@link FileReader},
+     *                 hence decoded with the default charset)
+     * @return the file content exactly as read; an empty string for an empty file
+     * @throws IOException if the file cannot be opened or read
+     */
+    public String readJson(String fileName) throws IOException {
+        StringBuilder stringBuilder = new StringBuilder();
+        // try-with-resources closes the reader even when read() throws
+        try (BufferedReader reader = new BufferedReader(new FileReader(fileName))) {
+            char[] buffer = new char[1024];
+            int charsRead;
+            while ((charsRead = reader.read(buffer)) != -1) {
+                // append only the chars filled by this read: appending the whole buffer
+                // would pad the result with '\0' after a short (typically the last) read
+                stringBuilder.append(buffer, 0, charsRead);
+            }
+        }
+        return stringBuilder.toString();
+    }
+
+}
diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java
index 125d375..2e08a69 100644
--- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java
+++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java
@@ -1,5 +1,6 @@
package eu.dnetlib.pace;
+import eu.dnetlib.ClusteringTester;
import eu.dnetlib.Deduper;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.Field;
@@ -8,16 +9,19 @@ import eu.dnetlib.pace.tree.support.TreeProcessor;
import eu.dnetlib.pace.tree.support.TreeStats;
import eu.dnetlib.pace.util.MapDocumentUtil;
import eu.dnetlib.pace.utils.Utility;
+import eu.dnetlib.support.Block;
import eu.dnetlib.support.ConnectedComponent;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.SparkSession;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
import scala.Tuple2;
+import java.io.IOException;
import java.net.URL;
import java.util.List;
import java.util.Map;
diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.current.conf.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.current.conf.json
index 31b200c..8e3b90e 100644
--- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.current.conf.json
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.current.conf.json
@@ -143,10 +143,10 @@
}
},
"model" : [
- { "name" : "country", "type" : "String", "path" : "$.country.classid"},
- { "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"},
- { "name" : "legalname", "type" : "String", "path" : "$.legalname.value" },
- { "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" },
+ { "name" : "country", "type" : "String", "path" : "$.organization.metadata.country.classid"},
+ { "name" : "legalshortname", "type" : "String", "path" : "$.organization.metadata.legalshortname.value"},
+ { "name" : "legalname", "type" : "String", "path" : "$.organization.metadata.legalname.value" },
+ { "name" : "websiteurl", "type" : "URL", "path" : "$.organization.metadata.websiteurl.value" },
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid')].value"},
{ "name" : "originalId", "type" : "String", "path" : "$.id" }
],
diff --git a/dnet-pace-core/pom.xml.releaseBackup b/dnet-pace-core/pom.xml.releaseBackup
new file mode 100644
index 0000000..0d18645
--- /dev/null
+++ b/dnet-pace-core/pom.xml.releaseBackup
@@ -0,0 +1,73 @@
+
+
+
+ 4.0.0
+
+
+ eu.dnetlib
+ dnet-dedup
+ 4.0.2-SNAPSHOT
+ ../pom.xml
+
+
+ dnet-pace-core
+ jar
+
+
+
+ edu.cmu
+ secondstring
+
+
+ com.google.guava
+ guava
+
+
+ com.google.code.gson
+ gson
+
+
+ org.apache.commons
+ commons-lang3
+
+
+ commons-io
+ commons-io
+
+
+
+ org.antlr
+ stringtemplate
+
+
+ commons-logging
+ commons-logging
+
+
+ junit
+ junit
+ test
+
+
+ org.reflections
+ reflections
+
+
+ com.fasterxml.jackson.core
+ jackson-databind
+
+
+ org.apache.commons
+ commons-math3
+
+
+
+ com.jayway.jsonpath
+ json-path
+
+
+
+
+
+
+
diff --git a/pom.xml.releaseBackup b/pom.xml.releaseBackup
new file mode 100644
index 0000000..d840af1
--- /dev/null
+++ b/pom.xml.releaseBackup
@@ -0,0 +1,333 @@
+
+
+
+ 4.0.0
+
+ eu.dnetlib
+ dnet-dedup
+ 4.0.2-SNAPSHOT
+
+ pom
+
+ http://www.d-net.research-infrastructures.eu
+
+
+
+ The Apache Software License, Version 2.0
+ http://www.apache.org/licenses/LICENSE-2.0.txt
+ repo
+ A business-friendly OSS license
+
+
+
+
+ scm:git:https://code-repo.d4science.org/D-Net/dnet-dedup.git
+ HEAD
+
+
+
+ dnet-pace-core
+ dnet-dedup-test
+
+
+
+ Redmine
+ https://issue.openaire.research-infrastructures.eu/projects/openaire
+
+
+
+
+
+
+ dnet45-releases
+ D-Net 45 Releases
+ http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases
+ default
+
+
+
+
+
+ dnet-deps
+ dnet-dependencies
+ http://maven.research-infrastructures.eu/nexus/content/repositories/dnet-deps
+ default
+
+
+ dnet45-releases
+ D-Net 45 Releases
+ http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases
+ default
+
+ true
+
+
+
+ dnet45-snapshots
+ D-Net 45 Snapshots
+ http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-snapshots
+ default
+
+ true
+
+
+
+
+ cloudera
+ Cloudera Repository
+ https://repository.cloudera.com/artifactory/cloudera-repos
+
+ true
+
+
+ false
+
+
+
+
+ target
+ target/classes
+ ${project.artifactId}-${project.version}
+ target/test-classes
+
+
+
+
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+ 3.6.0
+
+
+ 1.8
+ ${project.build.sourceEncoding}
+
+
+
+
+ org.apache.maven.plugins
+ maven-jar-plugin
+ 3.0.2
+
+
+
+ org.apache.maven.plugins
+ maven-source-plugin
+ 3.0.1
+
+
+ attach-sources
+ verify
+
+ jar-no-fork
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-surefire-plugin
+ 2.19.1
+
+ true
+
+
+
+
+ org.apache.maven.plugins
+ maven-javadoc-plugin
+ 2.10.4
+
+ true
+
+
+
+
+ org.apache.maven.plugins
+ maven-dependency-plugin
+ 3.0.0
+
+
+
+ org.apache.maven.plugins
+ maven-failsafe-plugin
+ 2.13
+
+
+ integration-test
+
+ integration-test
+
+
+
+ verify
+
+ verify
+
+
+
+
+
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-release-plugin
+ 2.5.3
+
+
+
+
+
+
+
+
+ edu.cmu
+ secondstring
+ 1.0.0
+
+
+ org.antlr
+ stringtemplate
+ 3.2
+
+
+
+ com.fasterxml.jackson.core
+ jackson-databind
+ ${jackson.version}
+
+
+
+ com.fasterxml.jackson.dataformat
+ jackson-dataformat-xml
+ ${jackson.version}
+
+
+ com.fasterxml.jackson.module
+ jackson-module-jsonSchema
+ ${jackson.version}
+
+
+
+
+
+ org.apache.commons
+ commons-math3
+ 3.6.1
+
+
+
+ com.google.guava
+ guava
+ ${google.guava.version}
+
+
+ com.google.code.gson
+ gson
+ ${google.gson.version}
+
+
+
+ org.apache.commons
+ commons-lang3
+ ${commons.lang.version}
+
+
+
+ commons-io
+ commons-io
+ ${commons.io.version}
+
+
+ commons-collections
+ commons-collections
+ ${commons.collections.version}
+
+
+ commons-logging
+ commons-logging
+ ${commons.logging.version}
+
+
+ org.apache.spark
+ spark-core_2.11
+ ${spark.version}
+ provided
+
+
+ org.apache.spark
+ spark-graphx_2.11
+ ${spark.version}
+ provided
+
+
+ org.apache.spark
+ spark-sql_2.11
+ ${spark.version}
+ provided
+
+
+ junit
+ junit
+ ${junit.version}
+ test
+
+
+ org.reflections
+ reflections
+ 0.9.10
+
+
+
+ org.scala-lang
+ scala-library
+ ${scala.version}
+
+
+
+ org.apache.oozie
+ oozie-client
+ 5.1.0
+
+
+ com.jayway.jsonpath
+ json-path
+ 2.4.0
+
+
+
+
+
+
+
+
+ UTF-8
+ UTF-8
+
+ 2.2.2
+ 15.0
+
+ 2.2.0
+ 2.6.6
+
+ 3.5
+ 2.4
+ 3.2.1
+ 1.1.3
+
+ 4.9
+ 2.11.8
+
+ false
+
+
diff --git a/release.properties b/release.properties
new file mode 100644
index 0000000..70a3c40
--- /dev/null
+++ b/release.properties
@@ -0,0 +1,22 @@
+#release configuration
+#Thu Jul 02 17:06:39 CEST 2020
+scm.commentPrefix=[maven-release-plugin]
+pushChanges=true
+project.rel.eu.dnetlib\:dnet-dedup-test=4.0.2
+scm.tag=dnet-dedup-4.0.2
+remoteTagging=true
+project.scm.eu.dnetlib\:dnet-dedup-test.empty=true
+projectVersionPolicyId=default
+scm.url=scm\:git\:https\://code-repo.d4science.org/D-Net/dnet-dedup.git
+scm.tagNameFormat=@{project.artifactId}-@{project.version}
+project.rel.eu.dnetlib\:dnet-dedup=4.0.2
+project.dev.eu.dnetlib\:dnet-pace-core=4.0.3-SNAPSHOT
+preparationGoals=clean verify
+project.scm.eu.dnetlib\:dnet-dedup.tag=HEAD
+project.scm.eu.dnetlib\:dnet-dedup.developerConnection=scm\:git\:https\://code-repo.d4science.org/D-Net/dnet-dedup.git
+exec.snapshotReleasePluginAllowed=false
+project.dev.eu.dnetlib\:dnet-dedup=4.0.3-SNAPSHOT
+project.scm.eu.dnetlib\:dnet-pace-core.empty=true
+project.dev.eu.dnetlib\:dnet-dedup-test=4.0.3-SNAPSHOT
+completedPhase=end-release
+project.rel.eu.dnetlib\:dnet-pace-core=4.0.2