diff --git a/dhp-build/dhp-build-properties-maven-plugin/pom.xml b/dhp-build/dhp-build-properties-maven-plugin/pom.xml
index e5e9d7f..2b5aa91 100644
--- a/dhp-build/dhp-build-properties-maven-plugin/pom.xml
+++ b/dhp-build/dhp-build-properties-maven-plugin/pom.xml
@@ -10,6 +10,7 @@
dhp-build-properties-maven-plugin
+ 4.1.13-SNAPSHOT
maven-plugin
This module is a maven plugin implementing custom properties substitutions in the build lifecycle
@@ -19,16 +20,19 @@
org.apache.maven
maven-plugin-api
3.6.3
+ provided
org.apache.maven
maven-project
2.2.1
+ provided
org.apache.maven
maven-artifact
2.2.1
+ provided
@@ -100,6 +104,29 @@
+
+
+
+
+ org.apache.maven.plugins
+ maven-plugin-plugin
+ 3.2
+
+ true
+
+
+
+ mojo-descriptor
+ process-classes
+
+ descriptor
+
+
+
+
+
+
+
diff --git a/dhp-build/dhp-build-properties-maven-plugin/test.properties b/dhp-build/dhp-build-properties-maven-plugin/test.properties
index d17c4d8..e39cb36 100644
--- a/dhp-build/dhp-build-properties-maven-plugin/test.properties
+++ b/dhp-build/dhp-build-properties-maven-plugin/test.properties
@@ -1 +1,2 @@
-# Tue Apr 19 15:27:59 CEST 2022
+# Sat Apr 15 10:38:57 CEST 2023
+projectPropertyKey=projectPropertyValue
diff --git a/dnet-dedup-test/job-override.properties b/dnet-dedup-test/job-override.properties
index 309e615..0f4866d 100644
--- a/dnet-dedup-test/job-override.properties
+++ b/dnet-dedup-test/job-override.properties
@@ -1,12 +1,6 @@
-#entitiesPath = /tmp/publications_test_dump
-#entitiesPath = /user/michele.debonis/raw_graph_for_testing/publication
-#workingPath = /user/michele.debonis/new_dedup_test/workingdirtree
-#dedupConfPath = /user/michele.debonis/new_dedup_test/pubs.tree.conf.json
-#numPartitions = 8000
-#useTree = false
-
useTree = true
-numPartitions = 1
-dedupConfPath = /user/michele.debonis/authors_dedup_test/auth.tree.conf.json
-workingPath = /user/michele.debonis/authors_dedup_test/workingdir
-entitiesPath = /user/michele.debonis/authors_dedup_test/authors-scad-zbmath-1.json
\ No newline at end of file
+entitiesPath = /user/michele.debonis/lda_experiments/authors_pubmed
+workingPath = /user/michele.debonis/authors_dedup/gt2_dedup
+numPartitions = 1000
+dedupConfPath = /user/michele.debonis/lda_experiments/authors.fdup.gt2.conf.json
+groundTruthFieldJPath = $.orcid
\ No newline at end of file
diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/Deduper.java b/dnet-dedup-test/src/main/java/eu/dnetlib/Deduper.java
index 5d5845e..88370a3 100644
--- a/dnet-dedup-test/src/main/java/eu/dnetlib/Deduper.java
+++ b/dnet-dedup-test/src/main/java/eu/dnetlib/Deduper.java
@@ -57,14 +57,13 @@ public class Deduper implements Serializable {
.reduceByKey((b1, b2) -> Block.from(b1, b2, of, maxQueueSize));
}
- public static Iterator> ccToMergeRel(ConnectedComponent cc, DedupConfig dedupConf) {
- return cc
- .getDocs()
+ public static Iterator> ccToMergeRel(Tuple2> cc, DedupConfig dedupConf) {
+ return cc._2()
.stream()
.flatMap(
id -> {
List> tmp = new ArrayList<>();
- tmp.add(new Tuple2<>(cc.getCcId(), id));
+ tmp.add(new Tuple2<>(cc._1(), id));
return tmp.stream();
})
.iterator();
@@ -144,13 +143,12 @@ public class Deduper implements Serializable {
.javaRDD()
.map(Relation::toEdgeRdd);
- JavaRDD ccs = JavaGraphProcessor
- .findCCs(vertexes, edgeRdd, maxIterations)
- .toJavaRDD();
+ JavaPairRDD> ccs = JavaGraphProcessor
+ .findCCs(vertexes, edgeRdd, dedupConf.getWf().getMaxIterations());
JavaRDD mergeRel = ccs
- .filter(k -> k.getDocs().size() > 1)
- .flatMap(cc -> ccToMergeRel(cc, dedupConf))
+ .filter(cc -> cc._2().size() > 1)
+ .flatMap(cc -> Deduper.ccToMergeRel(cc, dedupConf))
.map(it -> new Relation(it._1(), it._2(), "mergeRel"));
final Dataset mergeRels = spark
@@ -161,7 +159,7 @@ public class Deduper implements Serializable {
mergeRels.write().mode(SaveMode.Overwrite).parquet(mergeRelsPath);
}
- public static void createDedupEntity(DedupConfig dedupConf, String mergeRelsPath, String entitiesPath, SparkSession spark, String dedupEntityPath){
+ public static void createDedupEntity(DedupConfig dedupConf, String simRelsPath, String mergeRelsPath, String entitiesPath, SparkSession spark, String dedupEntityPath){
JavaPairRDD entities = spark
.read()
@@ -172,7 +170,15 @@ public class Deduper implements Serializable {
.toJavaRDD()
.mapToPair(t -> t);
- //