From e4a29a4513a5021e635991a91b2c24d2faf9f505 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Thu, 2 Jul 2020 12:36:33 +0200
Subject: [PATCH 1/4] fixed workflow for the import of the claims alone

---
 .../{ => oozie_app}/config-default.xml        |   0
 .../graph/raw_claims/oozie_app/workflow.xml   | 161 +++++++++++++++++
 .../dhp/oa/graph/raw_claims/workflow.xml      | 169 ------------------
 3 files changed, 161 insertions(+), 169 deletions(-)
 rename dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/{ => oozie_app}/config-default.xml (100%)
 create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/oozie_app/workflow.xml
 delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/workflow.xml

diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/oozie_app/config-default.xml
similarity index 100%
rename from dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/config-default.xml
rename to dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/oozie_app/config-default.xml
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/oozie_app/workflow.xml
new file mode 100644
index 0000000000..13ec192ef3
--- /dev/null
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/oozie_app/workflow.xml
@@ -0,0 +1,161 @@
+<workflow-app name="import_claims" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>graphOutputPath</name>
+            <description>the target path to store raw graph</description>
+        </property>
+        <property>
+            <name>reuseContent</name>
+            <value>false</value>
+            <description>should import content from the aggregator or reuse a previous version</description>
+        </property>
+        <property>
+            <name>contentPath</name>
+            <description>path location to store (or reuse) content from the aggregator</description>
+        </property>
+        <property>
+            <name>postgresURL</name>
+            <description>the postgres URL to access to the database</description>
+        </property>
+        <property>
+            <name>postgresUser</name>
+            <description>the user postgres</description>
+        </property>
+        <property>
+            <name>postgresPassword</name>
+            <description>the password postgres</description>
+        </property>
+        <property>
+            <name>dbSchema</name>
+            <value>beta</value>
+            <description>the database schema according to the D-Net infrastructure (beta or production)</description>
+        </property>
+        <property>
+            <name>mongoURL</name>
+            <description>mongoDB url, example: mongodb://[username:password@]host[:port]</description>
+        </property>
+        <property>
+            <name>mongoDb</name>
+            <description>mongo database</description>
+        </property>
+        <property>
+            <name>isLookupUrl</name>
+            <description>the address of the lookUp service</description>
+        </property>
+
+        <property>
+            <name>sparkDriverMemory</name>
+            <description>memory for driver process</description>
+        </property>
+        <property>
+            <name>sparkExecutorMemory</name>
+            <description>memory for individual executor</description>
+        </property>
+        <property>
+            <name>sparkExecutorCores</name>
+            <description>number of cores used by single executor</description>
+        </property>
+        <property>
+            <name>oozieActionShareLibForSpark2</name>
+            <description>oozie action sharelib for spark 2.*</description>
+        </property>
+        <property>
+            <name>spark2ExtraListeners</name>
+            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
+            <description>spark 2.* extra listeners classname</description>
+        </property>
+        <property>
+            <name>spark2SqlQueryExecutionListeners</name>
+            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
+            <description>spark 2.* sql query execution listeners classname</description>
+        </property>
+        <property>
+            <name>spark2YarnHistoryServerAddress</name>
+            <description>spark 2.* yarn history server address</description>
+        </property>
+        <property>
+            <name>spark2EventLogDir</name>
+            <description>spark 2.* event log dir location</description>
+        </property>
+    </parameters>
+
+    <global>
+        <job-tracker>${jobTracker}</job-tracker>
+        <name-node>${nameNode}</name-node>
+        <configuration>
+            <property>
+                <name>mapreduce.job.queuename</name>
+                <value>${queueName}</value>
+            </property>
+            <property>
+                <name>oozie.launcher.mapred.job.queue.name</name>
+                <value>${oozieLauncherQueueName}</value>
+            </property>
+            <property>
+                <name>oozie.action.sharelib.for.spark</name>
+                <value>${oozieActionShareLibForSpark2}</value>
+            </property>
+        </configuration>
+    </global>
+
+    <start to="ImportDB_claims"/>
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+    <action name="ImportDB_claims">
+        <java>
+            <main-class>eu.dnetlib.dhp.oa.graph.raw.MigrateDbEntitiesApplication</main-class>
+            <arg>--hdfsPath</arg><arg>${contentPath}/db_claims</arg>
+            <arg>--postgresUrl</arg><arg>${postgresURL}</arg>
+            <arg>--postgresUser</arg><arg>${postgresUser}</arg>
+            <arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
+            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
+            <arg>--action</arg><arg>claims</arg>
+            <arg>--dbschema</arg><arg>${dbSchema}</arg>
+        </java>
+        <ok to="ImportODF_claims"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="ImportODF_claims">
+        <java>
+            <main-class>eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication</main-class>
+            <arg>-p</arg><arg>${contentPath}/odf_claims</arg>
+            <arg>-mongourl</arg><arg>${mongoURL}</arg>
+            <arg>-mongodb</arg><arg>${mongoDb}</arg>
+            <arg>-f</arg><arg>ODF</arg>
+            <arg>-l</arg><arg>store</arg>
+            <arg>-i</arg><arg>claim</arg>
+        </java>
+        <ok to="ImportOAF_claims"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="ImportOAF_claims">
+        <java>
+            <main-class>eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication</main-class>
+            <arg>-p</arg><arg>${contentPath}/oaf_claims</arg>
+            <arg>-mongourl</arg><arg>${mongoURL}</arg>
+            <arg>-mongodb</arg><arg>${mongoDb}</arg>
+            <arg>-f</arg><arg>OAF</arg>
+            <arg>-l</arg><arg>store</arg>
+            <arg>-i</arg><arg>claim</arg>
+        </java>
+        <ok to="ImportDB"/>
+        <error to="Kill"/>
+    </action>
+
+    <end name="End"/>
+</workflow-app>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/workflow.xml
deleted file mode 100644
index 1ac456976d..0000000000
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/workflow.xml
+++ /dev/null
@@ -1,169 +0,0 @@
-<workflow-app name="import_claims" xmlns="uri:oozie:workflow:0.5">
-    <parameters>
-        <property>
-            <name>migrationClaimsPathStep1</name>
-            <description>the base path to store hdfs file</description>
-        </property>
-        <property>
-            <name>migrationClaimsPathStep2</name>
-            <description>the temporary path to store entities before dispatching</description>
-        </property>
-        <property>
-            <name>migrationClaimsPathStep3</name>
-            <description>the graph Raw base path</description>
-        </property>
-        <property>
-            <name>postgresURL</name>
-            <description>the postgres URL to access to the database</description>
-        </property>
-        <property>
-            <name>postgresUser</name>
-            <description>the user postgres</description>
-        </property>
-        <property>
-            <name>postgresPassword</name>
-            <description>the password postgres</description>
-        </property>
-        <property>
-            <name>mongoURL</name>
-            <description>mongoDB url, example: mongodb://[username:password@]host[:port]</description>
-        </property>
-        <property>
-            <name>mongoDb</name>
-            <description>mongo database</description>
-        </property>
-        <property>
-            <name>sparkDriverMemory</name>
-            <description>memory for driver process</description>
-        </property>
-        <property>
-            <name>sparkExecutorMemory</name>
-            <description>memory for individual executor</description>
-        </property>
-        <property>
-            <name>sparkExecutorCores</name>
-            <description>number of cores used by single executor</description>
-        </property>
-    </parameters>
-
-    <start to="ImportDBClaims"/>
-
-    <kill name="Kill">
-        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
-    </kill>
-
-    <action name="ImportDBClaims">
-        <java>
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <main-class>eu.dnetlib.dhp.migration.step1.MigrateDbEntitiesApplication</main-class>
-            <arg>-p</arg><arg>${migrationClaimsPathStep1}/db_claims</arg>
-            <arg>-pgurl</arg><arg>${postgresURL}</arg>
-            <arg>-pguser</arg><arg>${postgresUser}</arg>
-            <arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
-            <arg>-a</arg><arg>claims</arg>
-        </java>
-        <ok to="ImportODFClaims"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="ImportODFClaims">
-        <java>
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
-            <arg>-p</arg><arg>${migrationClaimsPathStep1}/odf_claims</arg>
-            <arg>-mongourl</arg><arg>${mongoURL}</arg>
-            <arg>-mongodb</arg><arg>${mongoDb}</arg>
-            <arg>-f</arg><arg>ODF</arg>
-            <arg>-l</arg><arg>store</arg>
-            <arg>-i</arg><arg>claim</arg>
-        </java>
-        <ok to="ImportOAFClaims"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="ImportOAFClaims">
-        <java>
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
-            <arg>-p</arg><arg>${migrationClaimsPathStep1}/oaf_claims</arg>
-            <arg>-mongourl</arg><arg>${mongoURL}</arg>
-            <arg>-mongodb</arg><arg>${mongoDb}</arg>
-            <arg>-f</arg><arg>OAF</arg>
-            <arg>-l</arg><arg>store</arg>
-            <arg>-i</arg><arg>claim</arg>
-        </java>
-        <ok to="ResetClaimEntities"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="ResetClaimEntities">
-        <fs>
-            <delete path="${migrationClaimsPathStep2}"/>
-        </fs>
-        <ok to="GenerateClaimEntities"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="GenerateClaimEntities">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>GenerateClaimEntities</name>
-            <class>eu.dnetlib.dhp.migration.step2.GenerateEntitiesApplication</class>
-            <jar>dhp-aggregation-${projectVersion}.jar</jar>
-            <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
-            <arg>-mt</arg><arg>yarn-cluster</arg>
-            <arg>-s</arg><arg>${migrationClaimsPathStep1}/db_claims,${migrationClaimsPathStep1}/oaf_claims,${migrationClaimsPathStep1}/odf_claims</arg>
-            <arg>-t</arg><arg>${migrationClaimsPathStep2}/claim_entities</arg>
-            <arg>-pgurl</arg><arg>${postgresURL}</arg>
-            <arg>-pguser</arg><arg>${postgresUser}</arg>
-            <arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
-        </spark>
-        <ok to="ResetClaimGraph"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="ResetClaimGraph">
-        <fs>
-            <delete path="${migrationClaimsPathStep3}"/>
-        </fs>
-        <ok to="GenerateClaimGraph"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="GenerateClaimGraph">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>GenerateClaimGraph</name>
-            <class>eu.dnetlib.dhp.migration.step3.DispatchEntitiesApplication</class>
-            <jar>dhp-aggregation-${projectVersion}.jar</jar>
-            <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
-            <arg>-mt</arg><arg>yarn-cluster</arg>
-            <arg>-s</arg><arg>${migrationClaimsPathStep2}/claim_entities</arg>
-            <arg>-g</arg><arg>${migrationClaimsPathStep3}</arg>
-        </spark>
-        <ok to="End"/>
-        <error to="Kill"/>
-    </action>
-
-    <end name="End"/>
-</workflow-app>
\ No newline at end of file
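
Each <arg> pair in the java actions above is handed to the application's main method as a plain CLI flag/value pair. For context, a minimal sketch of the equivalent standalone invocation of the DB claims import; the flag names come from the workflow itself, while the class wrapper and all connection values are placeholders, not part of the patch:

    // Sketch only: mirrors the ImportDB_claims action with placeholder values.
    public class ImportDbClaimsLocally {
        public static void main(String[] args) throws Exception {
            eu.dnetlib.dhp.oa.graph.raw.MigrateDbEntitiesApplication.main(new String[] {
                "--hdfsPath", "/tmp/content/db_claims",                         // ${contentPath}/db_claims
                "--postgresUrl", "jdbc:postgresql://localhost:5432/dnet",       // ${postgresURL}
                "--postgresUser", "dnet",                                       // ${postgresUser}
                "--postgresPassword", "secret",                                 // ${postgresPassword}
                "--isLookupUrl", "http://localhost:8280/is/services/isLookUp",  // ${isLookupUrl}
                "--action", "claims",   // import only the claim relations
                "--dbschema", "beta"    // ${dbSchema}: beta or production
            });
        }
    }

Running it for real of course requires reachable postgres and IS lookup services; the point is only to make the argument contract of the action visible.
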
From ed1c7e5d757bd4effec6f69cc12390e46c953519 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Thu, 2 Jul 2020 12:40:21 +0200
Subject: [PATCH 2/4] fixed workflow for the import of the claims alone

---
 .../dnetlib/dhp/oa/graph/raw_claims/oozie_app/workflow.xml | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/oozie_app/workflow.xml
index 13ec192ef3..66eaeeb263 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/oozie_app/workflow.xml
@@ -1,9 +1,5 @@
 <workflow-app name="import_claims" xmlns="uri:oozie:workflow:0.5">
     <parameters>
-        <property>
-            <name>graphOutputPath</name>
-            <description>the target path to store raw graph</description>
-        </property>
         <property>
             <name>reuseContent</name>
             <value>false</value>
@@ -152,7 +148,7 @@
             <arg>-l</arg><arg>store</arg>
             <arg>-i</arg><arg>claim</arg>
         </java>
-        <ok to="ImportDB"/>
+        <ok to="End"/>
         <error to="Kill"/>
     </action>
 
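
The action whose outgoing transition this patch re-routes to End is the OAF mdstore import. For context, a comparable standalone sketch; the -p/-mongourl/-mongodb/-f/-l/-i flag names are taken from the workflow (format, layout, interpretation in D-Net mdstore terms), all values are placeholders:

    // Sketch only: mirrors the ImportOAF_claims action with placeholder values.
    public class ImportOafClaimsLocally {
        public static void main(String[] args) throws Exception {
            eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication.main(new String[] {
                "-p", "/tmp/content/oaf_claims",            // ${contentPath}/oaf_claims
                "-mongourl", "mongodb://localhost:27017",   // ${mongoURL}
                "-mongodb", "mdstore",                      // ${mongoDb}
                "-f", "OAF",     // metadata format
                "-l", "store",   // mdstore layout
                "-i", "claim"    // mdstore interpretation
            });
        }
    }
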
From d380b85246bf26398d8e3b1c569a3cf1c5ea71c9 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Thu, 2 Jul 2020 12:42:13 +0200
Subject: [PATCH 3/4] unit test for the preparation of the relations

---
 .../oa/provision/PrepareRelationsJobTest.java | 29 ++++++++++++-------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJobTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJobTest.java
index c16bbc6fba..528532eddd 100644
--- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJobTest.java
+++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJobTest.java
@@ -1,9 +1,10 @@
 
 package eu.dnetlib.dhp.oa.provision;
 
-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
-import eu.dnetlib.dhp.schema.oaf.Relation;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
 import org.apache.commons.io.FileUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.FilterFunction;
@@ -19,9 +20,10 @@
 import org.junit.jupiter.api.io.TempDir;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
+import eu.dnetlib.dhp.schema.oaf.Relation;
 
 public class PrepareRelationsJobTest {
@@ -74,14 +76,19 @@ public class PrepareRelationsJobTest {
 				"-maxRelations", String.valueOf(maxRelations)
 			});
 
-		Dataset<Relation> out = spark.read()
-			.parquet(testPath.toString())
-			.as(Encoders.bean(Relation.class))
-			.cache();
+		Dataset<Relation> out = spark
+			.read()
+			.parquet(testPath.toString())
+			.as(Encoders.bean(Relation.class))
+			.cache();
 
 		Assertions.assertEquals(10, out.count());
 
-		Dataset<Row> freq = out.toDF().cube(SUBRELTYPE).count().filter((FilterFunction<Row>) value -> !value.isNullAt(0));
+		Dataset<Row> freq = out
+			.toDF()
+			.cube(SUBRELTYPE)
+			.count()
+			.filter((FilterFunction<Row>) value ->
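
The reformatted assertion block relies on a detail of Spark's cube(): besides one count row per subRelType value, it emits a roll-up row whose grouping column is null, which is why the test filters on !value.isNullAt(0). A self-contained sketch of that behavior, assuming a local master and toy data (class and column names here are illustrative only):

    import java.util.Arrays;
    import org.apache.spark.api.java.function.FilterFunction;
    import org.apache.spark.sql.*;

    public class CubeNullRowDemo {
        public static void main(String[] args) {
            SparkSession spark = SparkSession.builder()
                .master("local[*]").appName("cube-demo").getOrCreate();

            // Three relations: two "outcome", one "supplement".
            Dataset<Row> df = spark
                .createDataset(Arrays.asList("outcome", "outcome", "supplement"), Encoders.STRING())
                .toDF("subRelType");

            // cube() adds a grand-total row with subRelType = null; dropping it
            // leaves exactly the per-value frequencies the assertions read back.
            Dataset<Row> freq = df.cube("subRelType").count()
                .filter((FilterFunction<Row>) value -> !value.isNullAt(0));

            freq.show(); // outcome -> 2, supplement -> 1
            spark.stop();
        }
    }
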
+				!value.isNullAt(0));
 
 		long outcome = freq.filter(freq.col(SUBRELTYPE).equalTo(OUTCOME)).collectAsList().get(0).getAs("count");
 		long supplement = freq.filter(freq.col(SUBRELTYPE).equalTo(SUPPLEMENT)).collectAsList().get(0).getAs("count");

From 0f77cac4b55d83b2cdc1aa50c0c848f0720c3ed6 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Thu, 2 Jul 2020 12:43:51 +0200
Subject: [PATCH 4/4] fix: deduper must use queueMaxSize instead of
 groupMaxSize for the block definition

---
 .../src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java
index c72940deb8..180f9f8460 100644
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java
@@ -37,7 +37,7 @@ public class Deduper implements Serializable {
 	public static JavaPairRDD<String, Block> createSortedBlocks(
 		JavaPairRDD<String, MapDocument> mapDocs, DedupConfig config) {
 		final String of = config.getWf().getOrderField();
-		final int maxQueueSize = config.getWf().getGroupMaxSize();
+		final int maxQueueSize = config.getWf().getQueueMaxSize();
 
 		return mapDocs
 			// the reduce is just to be sure that we haven't document with same id
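
The one-line fix swaps two limits that the dedup workflow configuration exposes side by side: groupMaxSize governs a later grouping stage, while queueMaxSize is the bound intended for the sorted queue of documents that defines a block. A toy sketch of that intent, with stand-in types rather than the project's actual Block/MapDocument code:

    import java.util.ArrayList;
    import java.util.Comparator;
    import java.util.List;

    // Toy sketch: a "block" is the candidate documents sorted on the order
    // field and truncated to queueMaxSize, the limit the patch switches to.
    public class BlockSketch {
        static List<String> sortedBlock(List<String> docs,
                Comparator<String> byOrderField, int queueMaxSize) {
            List<String> sorted = new ArrayList<>(docs);
            sorted.sort(byOrderField);
            return new ArrayList<>(sorted.subList(0, Math.min(queueMaxSize, sorted.size())));
        }

        public static void main(String[] args) {
            List<String> docs = List.of("doc-c", "doc-a", "doc-d", "doc-b");
            // With queueMaxSize = 2 only the first two documents in order survive.
            System.out.println(sortedBlock(docs, Comparator.naturalOrder(), 2)); // [doc-a, doc-b]
        }
    }

Using groupMaxSize here would merely have capped blocks at the wrong threshold, silently changing how many candidate pairs the comparison stage sees.
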