implementation of the mechanism that checks the existence of a mergerel file

2020-03-23 17:13:30 +01:00 · 2020-03-23 17:13:30 +01:00 · f7890a90df
parent c20e179f5a
commit f7890a90df
6 changed files with 137 additions and 94 deletions
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkUpdateEntity.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkUpdateEntity.java
@ -4,10 +4,10 @@ import com.fasterxml.jackson.databind.DeserializationFeature;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.schema.oaf.*;
-import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
-import eu.dnetlib.pace.config.DedupConfig;
 import eu.dnetlib.pace.util.MapDocumentUtil;
 import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.*;
 import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaPairRDD;
@ -18,13 +18,14 @@ import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SparkSession;
-import org.dom4j.DocumentException;
 import scala.Tuple2;

 import java.io.IOException;

 public class SparkUpdateEntity {

+    final String IDJSONPATH = "$.id";
+
    public static void main(String[] args) throws Exception {
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkUpdateEntity.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/updateEntity_parameters.json")));
        parser.parseArgument(args);
@ -32,65 +33,82 @@ public class SparkUpdateEntity {
        new SparkUpdateEntity().run(parser);
    }

-    public void run(ArgumentApplicationParser parser) throws ISLookUpException, DocumentException {
+    public boolean mergeRelExists(String basePath, String entity) throws IOException {
+
+        boolean result = false;
+
+        FileSystem fileSystem = FileSystem.get(new Configuration());
+
+        FileStatus[] fileStatuses = fileSystem.listStatus(new Path(basePath));
+
+        for (FileStatus fs : fileStatuses) {
+            if (fs.isDirectory())
+                if (fileSystem.exists(new Path(DedupUtility.createMergeRelPath(basePath, fs.getPath().getName(), entity))))
+                    result = true;
+        }
+
+        return result;
+    }
+
+    public void run(ArgumentApplicationParser parser) throws IOException {

        final String graphBasePath = parser.get("graphBasePath");
        final String workingPath = parser.get("workingPath");
        final String dedupGraphPath = parser.get("dedupGraphPath");
-        final String isLookUpUrl = parser.get("isLookUpUrl");
-        final String actionSetId = parser.get("actionSetId");

        try (SparkSession spark = getSparkSession(parser)) {

            final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

-            for (DedupConfig dedupConf : DedupUtility.getConfigurations(isLookUpUrl, actionSetId)) {
+            //for each entity
+            for (OafEntityType entity: OafEntityType.values()) {

-                String subEntity = dedupConf.getWf().getSubEntityValue();
+                JavaRDD<String> sourceEntity = sc.textFile(DedupUtility.createEntityPath(graphBasePath, entity.toString()));

-                final Dataset<Relation> df = spark.read().load(DedupUtility.createMergeRelPath(workingPath, actionSetId, subEntity)).as(Encoders.bean(Relation.class));
-                final JavaPairRDD<String, String> mergedIds = df
-                        .where("relClass == 'merges'")
-                        .select(df.col("target"))
-                        .distinct()
-                        .toJavaRDD()
-                        .mapToPair((PairFunction<Row, String, String>) r -> new Tuple2<>(r.getString(0), "d"));
+                if (mergeRelExists(workingPath, entity.toString())) {

-                final JavaRDD<String> sourceEntity = sc.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity));
+                    final Dataset<Relation> rel = spark.read().load(DedupUtility.createMergeRelPath(workingPath, "*", entity.toString())).as(Encoders.bean(Relation.class));

-                final JavaRDD<String> dedupEntity = sc.textFile(DedupUtility.createDedupRecordPath(workingPath, actionSetId, subEntity));
+                    final JavaPairRDD<String, String> mergedIds = rel
+                            .where("relClass == 'merges'")
+                            .select(rel.col("target"))
+                            .distinct()
+                            .toJavaRDD()
+                            .mapToPair((PairFunction<Row, String, String>) r -> new Tuple2<>(r.getString(0), "d"));

-                JavaPairRDD<String, String> entitiesWithId = sourceEntity.mapToPair((PairFunction<String, String, String>) s -> new Tuple2<>(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s), s));
+                    final JavaRDD<String> dedupEntity = sc.textFile(DedupUtility.createDedupRecordPath(workingPath, "*", entity.toString()));

-                Class<? extends Oaf> mainClass;
-                switch (subEntity) {
-                    case "publication":
-                        mainClass = Publication.class;
-                        break;
-                    case "dataset":
-                        mainClass = eu.dnetlib.dhp.schema.oaf.Dataset.class;
-                        break;
-                    case "datasource":
-                        mainClass = Datasource.class;
-                        break;
-                    case "software":
-                        mainClass = Software.class;
-                        break;
-                    case "organization":
-                        mainClass = Organization.class;
-                        break;
-                    case "otherresearchproduct":
-                        mainClass = OtherResearchProduct.class;
-                        break;
-                    default:
-                        throw new IllegalArgumentException("Illegal type " + subEntity);
+                    JavaPairRDD<String, String> entitiesWithId = sourceEntity.mapToPair((PairFunction<String, String, String>) s -> new Tuple2<>(MapDocumentUtil.getJPathString(IDJSONPATH, s), s));
+
+                    JavaRDD<String> map = entitiesWithId.leftOuterJoin(mergedIds).map(k -> k._2()._2().isPresent() ? updateDeletedByInference(k._2()._1(), getOafClass(entity)) : k._2()._1());
+                    sourceEntity = map.union(dedupEntity);
                }

-                JavaRDD<String> map = entitiesWithId.leftOuterJoin(mergedIds).map(k -> k._2()._2().isPresent() ? updateDeletedByInference(k._2()._1(), mainClass) : k._2()._1());
-                map.union(dedupEntity).saveAsTextFile(dedupGraphPath + "/" + subEntity, GzipCodec.class);
+                sourceEntity.saveAsTextFile(dedupGraphPath + "/" + entity, GzipCodec.class);
+
            }
        }
+    }

+    public Class<? extends Oaf> getOafClass(OafEntityType className) {
+        switch (className.toString()) {
+            case "publication":
+                return Publication.class;
+            case "dataset":
+                return eu.dnetlib.dhp.schema.oaf.Dataset.class;
+            case "datasource":
+                return Datasource.class;
+            case "software":
+                return Software.class;
+            case "organization":
+                return Organization.class;
+            case "otherresearchproduct":
+                return OtherResearchProduct.class;
+            case "project":
+                return Project.class;
+            default:
+                throw new IllegalArgumentException("Illegal type " + className);
+        }
    }

    private static <T extends Oaf> String updateDeletedByInference(final String json, final Class<T> clazz) {
--- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency.oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency.oozie_app/config-default.xml
--- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency.oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency.oozie_app/workflow.xml
@ -1,12 +1,20 @@
-<workflow-app name="Create Similarity Relations" xmlns="uri:oozie:workflow:0.5">
+<workflow-app name="Build Root Records" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
            <name>graphBasePath</name>
            <description>the raw graph base path</description>
        </property>
+        <property>
+            <name>isLookUpUrl</name>
+            <description>the address of the lookUp service</description>
+        </property>
+        <property>
+            <name>actionSetId</name>
+            <description>id of the actionSet</description>
+        </property>
        <property>
            <name>workingPath</name>
-            <description>path for the working directory</description>
+            <description>path of the working directory</description>
        </property>
        <property>
            <name>dedupGraphPath</name>
@ -26,12 +34,41 @@
        </property>
    </parameters>

-    <start to="PropagateRelation"/>
+    <start to="UpdateEntity"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

+    <action name="UpdateEntity">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <prepare>
+                <delete path='${dedupGraphPath}'/>
+            </prepare>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>Create Dedup Record</name>
+            <class>eu.dnetlib.dhp.dedup.SparkUpdateEntity</class>
+            <jar>dhp-dedup-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory ${sparkExecutorMemory}
+                --executor-cores ${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
+                --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
+                --conf spark.sql.warehouse.dir="/user/hive/warehouse"
+            </spark-opts>
+            <arg>-mt</arg><arg>yarn-cluster</arg>
+            <arg>--i</arg><arg>${graphBasePath}</arg>
+            <arg>--w</arg><arg>${workingPath}</arg>
+            <arg>--o</arg><arg>${dedupGraphPath}</arg>
+        </spark>
+        <ok to="PropagateRelation"/>
+        <error to="Kill"/>
+    </action>
+
    <action name="PropagateRelation">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <prepare>
--- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/workflow.xml
@ -82,11 +82,13 @@
            <name>Create Dedup Record</name>
            <class>eu.dnetlib.dhp.dedup.SparkCreateDedupRecord</class>
            <jar>dhp-dedup-${projectVersion}.jar</jar>
-            <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory} --conf
-                spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf
-                spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf
-                spark.sql.warehouse.dir="/user/hive/warehouse"
+            <spark-opts>
+                --executor-memory ${sparkExecutorMemory}
+                --executor-cores ${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
+                --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
+                --conf spark.sql.warehouse.dir="/user/hive/warehouse"
            </spark-opts>
            <arg>-mt</arg><arg>yarn-cluster</arg>
            <arg>--i</arg><arg>${graphBasePath}</arg>
@ -94,32 +96,6 @@
            <arg>--la</arg><arg>${isLookUpUrl}</arg>
            <arg>--asi</arg><arg>${actionSetId}</arg>
        </spark>
-        <ok to="UpdateEntity"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="UpdateEntity">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>Create Dedup Record</name>
-            <class>eu.dnetlib.dhp.dedup.SparkUpdateEntity</class>
-            <jar>dhp-dedup-${projectVersion}.jar</jar>
-            <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory} --conf
-                spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf
-                spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf
-                spark.sql.warehouse.dir="/user/hive/warehouse"
-            </spark-opts>
-            <arg>-mt</arg><arg>yarn-cluster</arg>
-            <arg>--i</arg><arg>${graphBasePath}</arg>
-            <arg>--w</arg><arg>${workingPath}</arg>
-            <arg>--la</arg><arg>${isLookUpUrl}</arg>
-            <arg>--asi</arg><arg>${actionSetId}</arg>
-            <arg>--o</arg><arg>${dedupGraphPath}</arg>
-        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
--- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/updateEntity_parameters.json
+++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/updateEntity_parameters.json
@ -18,18 +18,6 @@
  "paramRequired": true
 },
 {
-  "paramName": "la",
-  "paramLongName": "isLookUpUrl",
-  "paramDescriptions": "the url of the lookup service",
-  "paramRequired": true
-},
-{
-  "paramName": "asi",
-  "paramLongName": "actionSetId",
-  "paramDescriptions": "the id of the actionset (orchestrator)",
-  "paramRequired": true
-},
-  {
    "paramName": "o",
    "paramLongName": "dedupGraphPath",
    "paramDescription": "the path of the dedup graph",
--- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/SparkCreateDedupTest.java
+++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/SparkCreateDedupTest.java
@ -4,6 +4,8 @@ import com.google.common.hash.HashFunction;
 import com.google.common.hash.Hashing;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.*;
 import org.junit.Before;
 import org.junit.Ignore;
 import org.junit.Test;
@ -17,13 +19,14 @@ public class SparkCreateDedupTest {

    @Before
    public void setUp() throws IOException {
-        configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org.curr.conf.json"));
+//        configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org.curr.conf.json"));
+        configuration = "";
    }

    @Test
    @Ignore
    public void createSimRelsTest2() throws Exception {
-        SparkCreateSimRels.main(new String[] {
+        SparkCreateSimRels.main(new String[]{
                "-mt", "local[*]",
                "-s", "/Users/miconis/dumps",
                "-e", entity,
@ -40,7 +43,7 @@ public class SparkCreateDedupTest {
    @Ignore
    public void createCCTest() throws Exception {

-        SparkCreateConnectedComponent.main(new String[] {
+        SparkCreateConnectedComponent.main(new String[]{
                "-mt", "local[*]",
                "-s", "/Users/miconis/dumps",
                "-e", entity,
@ -52,7 +55,7 @@ public class SparkCreateDedupTest {
    @Test
    @Ignore
    public void dedupRecordTest() throws Exception {
-        SparkCreateDedupRecord.main(new String[] {
+        SparkCreateDedupRecord.main(new String[]{
                "-mt", "local[*]",
                "-s", "/Users/miconis/dumps",
                "-e", entity,
@ -62,21 +65,42 @@ public class SparkCreateDedupTest {
    }

    @Test
+    @Ignore
    public void printConfiguration() throws Exception {
        System.out.println(ArgumentApplicationParser.compressArgument(configuration));
    }

    @Test
+    @Ignore
    public void testHashCode() {
        final String s1 = "20|grid________::6031f94bef015a37783268ec1e75f17f";
        final String s2 = "20|nsf_________::b12be9edf414df8ee66b4c52a2d8da46";

        final HashFunction hashFunction = Hashing.murmur3_128();

-        System.out.println( s1.hashCode());
+        System.out.println(s1.hashCode());
        System.out.println(hashFunction.hashString(s1).asLong());
-        System.out.println( s2.hashCode());
+        System.out.println(s2.hashCode());
        System.out.println(hashFunction.hashString(s2).asLong());
    }

+    @Test
+    public void fileExistsTest() throws IOException {
+
+        boolean result = false;
+
+        FileSystem fileSystem = FileSystem.get(new Configuration());
+
+        FileStatus[] fileStatuses = fileSystem.listStatus(new Path("/tmp"));
+
+        for (FileStatus fs : fileStatuses) {
+            if (fs.isDirectory()) {
+                if (fileSystem.exists(new Path(DedupUtility.createMergeRelPath("/tmp", fs.getPath().getName(), "cicciopasticcio")))) {
+                    System.out.println("fs = " + DedupUtility.createMergeRelPath("/tmp", fs.getPath().getName(), "cicciopasticcio"));
+                    result = true;
+                }
+            }
+        }
+
+    }
 }