merge branch with master

2020-11-05 16:31:18 +01:00 · 2020-11-05 16:31:18 +01:00 · f8e9bda24c
parent 7ebdfacee9 afa0b1489b
commit f8e9bda24c
8 changed files with 267 additions and 165 deletions
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/MakeTarArchive.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/MakeTarArchive.java
@ -1,14 +1,15 @@
-package eu.dnetlib.dhp.common;

-import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
-import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
-import org.apache.hadoop.fs.*;
+package eu.dnetlib.dhp.common;

 import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.Serializable;

+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
+import org.apache.hadoop.fs.*;
+
 public class MakeTarArchive implements Serializable {

 	private static TarArchiveOutputStream getTar(FileSystem fileSystem, String outputPath) throws IOException {
@ -113,7 +114,4 @@ public class MakeTarArchive implements Serializable {
 		return current_size;
 	}

-
-
-
 }
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnectorTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnectorTest.java
@ -9,7 +9,6 @@ import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;

-
@Disabled
 public class HttpConnectorTest {

--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/oozie_app/workflow.xml
@ -320,6 +320,7 @@
        <ok to="join_extend"/>
        <error to="Kill"/>
    </action>
+
    <action name="extend_orp">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
@ -344,6 +345,7 @@
        <ok to="join_extend"/>
        <error to="Kill"/>
    </action>
+
    <action name="extend_software">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
@ -407,7 +409,6 @@
        <error to="Kill"/>
    </action>

-
    <action name="send_zenodo">
        <java>
            <main-class>eu.dnetlib.dhp.oa.graph.dump.SendToZenodoHDFS</main-class>
@ -424,8 +425,6 @@
        <error to="Kill"/>
    </action>

-
-
    <end name="End"/>

 </workflow-app>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump_whole/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump_whole/oozie_app/workflow.xml
@ -357,10 +357,8 @@
        <error to="Kill"/>
    </action>

-
    <join name="join_dump" to="fork_context"/>

-
    <fork name="fork_context">
        <path start="create_entities_fromcontext"/>
        <path start="create_relation_fromcontext"/>
@ -389,7 +387,6 @@
        <error to="Kill"/>
    </action>

-
    <action name="create_relation_fromorgs">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
@ -418,7 +415,6 @@

    <join name="join_context" to="fork_extract_relations"/>

-
    <fork name="fork_extract_relations">
        <path start="rels_from_pubs"/>
        <path start="rels_from_dats"/>
@ -530,7 +526,6 @@
        <error to="Kill"/>
    </action>

-
    <join name="join_extract_relations" to="collect_and_save"/>

    <action name="collect_and_save">
@ -569,7 +564,6 @@
        <error to="Kill"/>
    </action>
    
-
    <action name="send_zenodo">
        <java>
            <main-class>eu.dnetlib.dhp.oa.graph.dump.SendToZenodoHDFS</main-class>
--- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateScholixIndex.scala
+++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateScholixIndex.scala
@ -6,11 +6,36 @@ import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary
 import eu.dnetlib.dhp.schema.oaf.Relation
 import org.apache.commons.io.IOUtils
 import org.apache.spark.SparkConf
+import org.apache.spark.sql.expressions.Aggregator
 import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}

 object SparkGenerateScholixIndex {


+
+  def getScholixAggregator(): Aggregator[(String, Scholix), Scholix, Scholix] = new Aggregator[(String, Scholix), Scholix, Scholix]{
+
+    override def zero: Scholix = new Scholix()
+
+    override def reduce(b: Scholix, a: (String, Scholix)): Scholix = {
+      b.mergeFrom(a._2)
+      b
+    }
+
+    override def merge(wx: Scholix, wy: Scholix): Scholix = {
+      wx.mergeFrom(wy)
+      wx
+    }
+    override def finish(reduction: Scholix): Scholix = reduction
+
+    override def bufferEncoder: Encoder[Scholix] =
+      Encoders.kryo(classOf[Scholix])
+
+    override def outputEncoder: Encoder[Scholix] =
+      Encoders.kryo(classOf[Scholix])
+  }
+
+
  def main(args: Array[String]): Unit = {
    val parser = new ArgumentApplicationParser(IOUtils.toString(SparkGenerateScholixIndex.getClass.getResourceAsStream("/eu/dnetlib/dhp/provision/input_generate_summary_parameters.json")))
    parser.parseArgument(args)
@ -40,7 +65,7 @@ object SparkGenerateScholixIndex {

        (relation.getTarget, Scholix.generateScholixWithSource(summary,relation))

-      }).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/scholix_source")
+      }).repartition(6000).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/scholix_source")

    val sTarget:Dataset[(String,Scholix)] = spark.read.load(s"$workingDirPath/scholix_source").as[(String, Scholix)]

@ -53,9 +78,16 @@ object SparkGenerateScholixIndex {
      scholix.generateIdentifier()
      scholix.generatelinkPublisher()
      scholix
-    }).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/scholix")
+    }).repartition(6000).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/scholix_r")


+    val finalScholix:Dataset[Scholix] = spark.read.load(s"$workingDirPath/scholix_r").as[Scholix]
+
+    finalScholix.map(d => (d.getIdentifier, d))(Encoders.tuple(Encoders.STRING, scholixEncoder))
+      .groupByKey(_._1)(Encoders.STRING)
+      .agg(getScholixAggregator().toColumn)
+      .map(p => p._2)
+      .write.mode(SaveMode.Overwrite).save(s"$workingDirPath/scholix")

  }

--- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/Scholix.java
+++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/Scholix.java
@ -5,6 +5,8 @@ import java.io.Serializable;
 import java.util.*;
 import java.util.stream.Collectors;

+import org.apache.commons.lang3.StringUtils;
+
 import com.fasterxml.jackson.databind.ObjectMapper;

 import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
@ -91,13 +93,91 @@ public class Scholix implements Serializable {
 		s.setSource(ScholixResource.fromSummary(scholixSummary));

 		s.setIdentifier(rel.getTarget());
-		// ScholixResource mockTarget = new ScholixResource();
-		// mockTarget.setDnetIdentifier(rel.getTarget());
-		// s.setTarget(mockTarget);
-		// s.generateIdentifier();
 		return s;
 	}

+	private List<ScholixEntityId> mergeScholixEntityId(final List<ScholixEntityId> a, final List<ScholixEntityId> b) {
+		final List<ScholixEntityId> m = new ArrayList<>(a);
+		if (b != null)
+			b.forEach(s -> {
+				int tt = (int) m.stream().filter(t -> t.getName().equalsIgnoreCase(s.getName())).count();
+				if (tt == 0) {
+					m.add(s);
+				}
+			});
+		return m;
+	}
+
+	private List<ScholixIdentifier> mergeScholixIdnetifier(final List<ScholixIdentifier> a,
+		final List<ScholixIdentifier> b) {
+		final List<ScholixIdentifier> m = new ArrayList<>(a);
+		if (b != null)
+			b.forEach(s -> {
+				int tt = (int) m.stream().filter(t -> t.getIdentifier().equalsIgnoreCase(s.getIdentifier())).count();
+				if (tt == 0) {
+					m.add(s);
+				}
+			});
+		return m;
+	}
+
+	private List<ScholixCollectedFrom> mergeScholixCollectedFrom(final List<ScholixCollectedFrom> a,
+		final List<ScholixCollectedFrom> b) {
+		final List<ScholixCollectedFrom> m = new ArrayList<>(a);
+		if (b != null)
+			b.forEach(s -> {
+				int tt = (int) m
+					.stream()
+					.filter(t -> t.getProvider().getName().equalsIgnoreCase(s.getProvider().getName()))
+					.count();
+				if (tt == 0) {
+					m.add(s);
+				}
+			});
+		return m;
+	}
+
+	private ScholixRelationship mergeRelationships(final ScholixRelationship a, final ScholixRelationship b) {
+		ScholixRelationship result = new ScholixRelationship();
+		result.setName(StringUtils.isEmpty(a.getName()) ? b.getName() : a.getName());
+		result.setInverse(StringUtils.isEmpty(a.getInverse()) ? b.getInverse() : a.getInverse());
+		result.setSchema(StringUtils.isEmpty(a.getSchema()) ? b.getSchema() : a.getSchema());
+		return result;
+	}
+
+	private ScholixResource mergeResource(final ScholixResource a, final ScholixResource b) {
+
+		final ScholixResource result = new ScholixResource();
+		result.setCollectedFrom(mergeScholixCollectedFrom(a.getCollectedFrom(), b.getCollectedFrom()));
+		result.setCreator(mergeScholixEntityId(a.getCreator(), b.getCreator()));
+		result
+			.setDnetIdentifier(
+				StringUtils.isBlank(a.getDnetIdentifier()) ? b.getDnetIdentifier() : a.getDnetIdentifier());
+		result.setIdentifier(mergeScholixIdnetifier(a.getIdentifier(), b.getIdentifier()));
+		result.setObjectType(StringUtils.isNotBlank(a.getObjectType()) ? a.getObjectType() : b.getObjectType());
+		result
+			.setObjectSubType(
+				StringUtils.isNotBlank(a.getObjectSubType()) ? a.getObjectSubType() : b.getObjectSubType());
+		result.setPublisher(mergeScholixEntityId(a.getPublisher(), b.getPublisher()));
+		result
+			.setPublicationDate(
+				StringUtils.isNotBlank(a.getPublicationDate()) ? a.getPublicationDate() : b.getPublicationDate());
+		result.setTitle(StringUtils.isNotBlank(a.getTitle()) ? a.getTitle() : b.getTitle());
+		return result;
+
+	}
+
+	public void mergeFrom(final Scholix other) {
+		linkprovider = mergeScholixEntityId(linkprovider, other.getLinkprovider());
+		publisher = mergeScholixEntityId(publisher, other.getPublisher());
+		if (StringUtils.isEmpty(publicationDate))
+			publicationDate = other.getPublicationDate();
+		relationship = mergeRelationships(relationship, other.getRelationship());
+		source = mergeResource(source, other.getSource());
+		target = mergeResource(target, other.getTarget());
+		generateIdentifier();
+	}
+
 	public void generatelinkPublisher() {
 		Set<String> publisher = new HashSet<>();
 		if (source.getPublisher() != null)
--- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/provision/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/provision/oozie_app/workflow.xml
@ -108,7 +108,7 @@
            <arg>-m</arg> <arg>yarn-cluster</arg>
            <arg>--workingPath</arg><arg>${workingDirPath}</arg>
        </spark>
-        <ok to="DropAndCreateIndex"/>
+        <ok to="End"/>
        <error to="Kill"/>
    </action>