diff --git a/dnet-dedup-test/dependency-reduced-pom.xml b/dnet-dedup-test/dependency-reduced-pom.xml
new file mode 100644
index 0000000..07b9268
--- /dev/null
+++ b/dnet-dedup-test/dependency-reduced-pom.xml
@@ -0,0 +1,119 @@
+
+
+
+ dnet-dedup
+ eu.dnetlib
+ 3.0.14-SNAPSHOT
+
+ 4.0.0
+ dnet-dedup-test
+
+
+
+ maven-shade-plugin
+ 2.4.3
+
+
+ package
+
+ shade
+
+
+
+
+ *:*
+
+ META-INF/*.SF
+ META-INF/*.DSA
+ META-INF/*.RSA
+
+
+
+
+
+
+
+
+ maven-deploy-plugin
+ 2.7
+
+ true
+
+
+
+ maven-compiler-plugin
+
+
+ 1.8
+
+ **/*.java
+
+
+
+
+ net.alchim31.maven
+ scala-maven-plugin
+ 4.0.1
+
+
+ scala-compile-first
+ initialize
+
+ add-source
+ compile
+
+
+
+ scala-test-compile
+ process-test-resources
+
+ testCompile
+
+
+
+
+ ${scala.version}
+
+
+
+
+
+
+ junit
+ junit
+ 4.9
+ test
+
+
+ hamcrest-core
+ org.hamcrest
+
+
+
+
+ org.apache.oozie
+ oozie-client
+ 5.1.0
+ test
+
+
+ json-simple
+ com.googlecode.json-simple
+
+
+ jms
+ javax.jms
+
+
+ slf4j-simple
+ org.slf4j
+
+
+ oozie-fluent-job-api
+ org.apache.oozie
+
+
+
+
+
+
diff --git a/dnet-dedup-test/pom.xml b/dnet-dedup-test/pom.xml
index 601de65..e5d429b 100644
--- a/dnet-dedup-test/pom.xml
+++ b/dnet-dedup-test/pom.xml
@@ -15,6 +15,33 @@
+
+
+ org.apache.maven.plugins
+ maven-shade-plugin
+ 2.4.3
+
+
+ package
+
+ shade
+
+
+
+
+ *:*
+
+ META-INF/*.SF
+ META-INF/*.DSA
+ META-INF/*.RSA
+
+
+
+
+
+
+
+
org.apache.maven.plugins
maven-deploy-plugin
diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/Block.java b/dnet-dedup-test/src/main/java/eu/dnetlib/Block.java
new file mode 100644
index 0000000..5d8aa98
--- /dev/null
+++ b/dnet-dedup-test/src/main/java/eu/dnetlib/Block.java
@@ -0,0 +1,50 @@
+package eu.dnetlib;
+
+import eu.dnetlib.pace.model.MapDocument;
+
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.StreamSupport;
+
+/** A blocking key together with the elements grouped under it. */
+public class Block implements Serializable {
+    String key;
+    List elements;
+
+    /** Creates a block whose element list is a fresh copy of everything {@code elements} yields. */
+    public Block(String key, Iterable elements){
+        this.key = key;
+        this.elements = StreamSupport.stream(elements.spliterator(), false).collect(Collectors.toList());
+    }
+
+    /** Creates a block that keeps a reference to the given list; no copy is made. */
+    public Block(String key, List elements){
+        this.key = key;
+        this.elements = elements;
+    }
+
+    public String getKey() { return key; }
+
+    public void setKey(String key) { this.key = key; }
+
+    public List getElements() { return elements; }
+
+    public void setElements(List elements) { this.elements = elements; }
+
+    /**
+     * Returns the number of unordered element pairs, n*(n-1)/2, for n = {@code elements.size()}.
+     * The product is taken in long arithmetic: as an int it overflows from n = 46342, although
+     * the halved value fits an int up to n = 65536. Beyond that, Integer.MAX_VALUE is returned.
+     */
+    public int comparisons(){
+        final long size = elements.size();
+        return (int) Math.min(Integer.MAX_VALUE, size * (size - 1) / 2);
+    }
+
+    /** Returns the number of elements currently held by this block. */
+    public int elements(){
+        return elements.size();
+    }
+}
diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest.java b/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest.java
index 8a1783a..851c82e 100644
--- a/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest.java
+++ b/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest.java
@@ -1,12 +1,12 @@
package eu.dnetlib;
-import com.google.common.collect.Iterables;
import eu.dnetlib.graph.GraphProcessor;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.utils.PaceUtils;
import eu.dnetlib.reporter.SparkBlockProcessor;
import eu.dnetlib.reporter.SparkReporter;
+
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
@@ -17,13 +17,15 @@ import org.apache.spark.util.LongAccumulator;
import scala.Tuple2;
import java.net.URL;
-import java.util.Map;
+import java.util.*;
import java.util.stream.Collectors;
public class SparkLocalTest {
public static void main(String[] args) {
+ double startTime = System.currentTimeMillis();
+
final SparkSession spark = SparkSession
.builder()
.appName("Deduplication")
@@ -33,7 +35,7 @@ public class SparkLocalTest {
final JavaSparkContext context = new JavaSparkContext(spark.sparkContext());
final URL dataset = SparkLocalTest.class.getResource("/eu/dnetlib/pace/organization.to.fix.json");
- final JavaRDD dataRDD = context.textFile(dataset.getPath());
+ JavaRDD dataRDD = context.textFile(dataset.getPath());
//read the configuration from the classpath
final DedupConfig config = DedupConfig.load(Utility.readFromClasspath("/eu/dnetlib/pace/org.curr.conf", SparkLocalTest.class));
@@ -46,32 +48,24 @@ public class SparkLocalTest {
return new Tuple2<>(mapDocument.getIdentifier(), mapDocument);
});
-// mapDocs.foreach(doc -> System.out.println("doc = " + doc._2().getFieldMap().get("legalname")));
-
-// mapDocs.filter(d -> d._2().getFieldMap().get("doi").stringValue().length() > 0).foreach(d -> System.out.println(d));
-// mapDocs.filter(d -> d._2().getFieldMap().get("documentationUrl").stringValue().length() > 0).foreach(d -> System.out.println(d));
+// System.out.println("mapDocs = " + mapDocs.count());
RDD> vertexes = mapDocs.mapToPair(t -> new Tuple2