[Organization] dump for organizations only

2024-02-06 14:13:12 +02:00 · 2024-02-06 14:13:12 +02:00 · e4b56a4f88
parent 3ad0d6edfc
commit e4b56a4f88
5 changed files with 393 additions and 1 deletions
--- a/dump-schema/src/test/java/GenerateJsonSchema.java
+++ b/dump-schema/src/test/java/GenerateJsonSchema.java
@ -1,6 +1,5 @@
 import java.io.IOException;

-import eu.dnetlib.dhp.oa.model.Result;
 import org.junit.jupiter.api.Test;

 import com.fasterxml.jackson.core.JsonProcessingException;
@ -10,6 +9,7 @@ import com.github.imifou.jsonschema.module.addon.AddonModule;
 import com.github.victools.jsonschema.generator.*;

 import eu.dnetlib.dhp.ExecCreateSchemas;
+import eu.dnetlib.dhp.oa.model.Result;
 import eu.dnetlib.dhp.oa.model.community.CommunityResult;
 import eu.dnetlib.dhp.oa.model.graph.*;

--- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/funderresults/SparkDumpFunderResults.java
+++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/funderresults/SparkDumpFunderResults.java
@ -9,6 +9,8 @@ import java.util.Objects;
 import java.util.Optional;
 import java.util.stream.Collectors;

+import com.amazonaws.transform.SimpleTypeUnmarshallers;
+import io.netty.util.internal.StringUtil;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.FilterFunction;
@ -95,6 +97,8 @@ public class SparkDumpFunderResults implements Serializable {
 		Optional<Funder> ofunder = Optional.ofNullable(p.getFunder());
 		if (ofunder.isPresent()) {
 			String fName = ofunder.get().getShortName();
+			if(StringUtil.isNullOrEmpty(fName))
+				return ofunder.get().getName();
 			if (fName.equalsIgnoreCase("ec")) {
 				fName += "_" + ofunder.get().getFundingStream();
 			}
--- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/organizationonly/SparkDumpOrganizationJob.java
+++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/organizationonly/SparkDumpOrganizationJob.java
@ -0,0 +1,270 @@
+
+package eu.dnetlib.dhp.oa.graph.dump.organizationonly;
+
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+import static eu.dnetlib.dhp.oa.graph.dump.Utils.ENTITY_ID_SEPARATOR;
+import static eu.dnetlib.dhp.oa.graph.dump.Utils.getEntityId;
+
+import java.io.Serializable;
+import java.io.StringReader;
+import java.util.*;
+import java.util.stream.Collectors;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.StringUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.FilterFunction;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import org.dom4j.Document;
+import org.dom4j.DocumentException;
+import org.dom4j.Node;
+import org.dom4j.io.SAXReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.oa.graph.dump.Constants;
+import eu.dnetlib.dhp.oa.graph.dump.ResultMapper;
+import eu.dnetlib.dhp.oa.graph.dump.Utils;
+import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
+import eu.dnetlib.dhp.oa.graph.dump.exceptions.CardinalityTooHighException;
+import eu.dnetlib.dhp.oa.graph.dump.exceptions.NoAvailableEntityTypeException;
+import eu.dnetlib.dhp.oa.model.Container;
+import eu.dnetlib.dhp.oa.model.Provenance;
+import eu.dnetlib.dhp.oa.model.Result;
+import eu.dnetlib.dhp.oa.model.graph.*;
+import eu.dnetlib.dhp.oa.model.graph.Datasource;
+import eu.dnetlib.dhp.oa.model.graph.Organization;
+import eu.dnetlib.dhp.oa.model.graph.Project;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import scala.Tuple2;
+
+/**
+ * Spark Job that fires the dump for the entites
+ */
+public class SparkDumpOrganizationJob implements Serializable {
+	private static final Logger log = LoggerFactory
+		.getLogger(eu.dnetlib.dhp.oa.graph.dump.organizationonly.SparkDumpOrganizationJob.class);
+	public static final String COMPRESSION = "compression";
+	public static final String GZIP = "gzip";
+
+	public static void main(String[] args) throws Exception {
+
+		Boolean isSparkSessionManaged = Boolean.TRUE;
+		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+		final String inputPath = "/tmp/prod_provision/graph/20_graph_blacklisted/";
+		log.info("inputPath: {}", inputPath);
+
+		final String outputPath = "/tmp/miriam/organizationsOnly/";
+		log.info("outputPath: {}", outputPath);
+		SparkConf conf = new SparkConf();
+		runWithSparkSession(
+			conf,
+			isSparkSessionManaged,
+			spark -> {
+				// Utils.removeOutputDir(spark, outputPath);
+				organizationMap(spark, inputPath, outputPath);
+				// relationMap2(spark, inputPath, outputPath);
+			});
+
+	}
+
+	private static void relationMap2(SparkSession spark, String inputPath, String outputPath) {
+		Utils
+			.readPath(spark, inputPath + "relation", Relation.class)
+			.filter((FilterFunction<Relation>) r -> r.getRelType().equalsIgnoreCase("organizationOrganization"))
+			.map((MapFunction<Relation, eu.dnetlib.dhp.oa.model.graph.Relation>) relation -> {
+				eu.dnetlib.dhp.oa.model.graph.Relation relNew = new eu.dnetlib.dhp.oa.model.graph.Relation();
+				relNew
+					.setSource(getEntityId(relation.getSource(), ENTITY_ID_SEPARATOR));
+				relNew.setSourceType(ModelSupport.idPrefixEntity.get(relation.getSource().substring(0, 2)));
+
+				relNew
+					.setTarget(getEntityId(relation.getTarget(), ENTITY_ID_SEPARATOR));
+				relNew.setTargetType(ModelSupport.idPrefixEntity.get(relation.getTarget().substring(0, 2)));
+
+				relNew
+					.setReltype(
+						RelType
+							.newInstance(
+								relation.getRelClass(),
+								relation.getSubRelType()));
+
+				Optional<DataInfo> odInfo = Optional.ofNullable(relation.getDataInfo());
+				if (odInfo.isPresent()) {
+					DataInfo dInfo = odInfo.get();
+					if (Optional.ofNullable(dInfo.getProvenanceaction()).isPresent() &&
+						Optional.ofNullable(dInfo.getProvenanceaction().getClassname()).isPresent()) {
+						relNew
+							.setProvenance(
+								Provenance
+									.newInstance(
+										dInfo.getProvenanceaction().getClassname(),
+										dInfo.getTrust()));
+					}
+				}
+				if (Boolean.TRUE.equals(relation.getValidated())) {
+					relNew.setValidated(relation.getValidated());
+					relNew.setValidationDate(relation.getValidationDate());
+				}
+
+				return relNew;
+			}, Encoders.bean(eu.dnetlib.dhp.oa.model.graph.Relation.class))
+			.write()
+			.mode(SaveMode.Overwrite)
+			.option("compression", "gzip")
+			.json(outputPath + "relation");
+	}
+
+	private static void relationMap(SparkSession spark, String inputPath, String outputPath) {
+		Dataset<eu.dnetlib.dhp.schema.oaf.Organization> organization = Utils
+			.readPath(spark, inputPath + "organization", eu.dnetlib.dhp.schema.oaf.Organization.class);
+		Dataset<Relation> rels = Utils.readPath(spark, inputPath + "relation", Relation.class);
+		organization
+			.joinWith(rels, organization.col("id").equalTo(rels.col("source")), "left")
+			.map(
+				(MapFunction<Tuple2<eu.dnetlib.dhp.schema.oaf.Organization, Relation>, Relation>) t2 -> t2._2(),
+				Encoders.bean(Relation.class))
+			.filter(Objects::nonNull)
+			.write()
+			.mode(SaveMode.Overwrite)
+			.option("compression", "gzip")
+			.json("/tmp/orgSource");
+
+		rels = Utils.readPath(spark, "/tmp/orgSource", Relation.class);
+
+		organization
+			.joinWith(rels, organization.col("id").equalTo(rels.col("target")), "left")
+			.map(
+				(MapFunction<Tuple2<eu.dnetlib.dhp.schema.oaf.Organization, Relation>, Relation>) t2 -> t2._2(),
+				Encoders.bean(Relation.class))
+			.filter(Objects::nonNull)
+			.write()
+			.mode(SaveMode.Overwrite)
+			.option("compression", "gzip")
+			.json("/tmp/orgSourceTarget");
+
+		Utils
+			.readPath(spark, "/tmp/orgSourceTarget", Relation.class)
+			.map((MapFunction<Relation, eu.dnetlib.dhp.oa.model.graph.Relation>) relation -> {
+				eu.dnetlib.dhp.oa.model.graph.Relation relNew = new eu.dnetlib.dhp.oa.model.graph.Relation();
+				relNew
+					.setSource(getEntityId(relation.getSource(), ENTITY_ID_SEPARATOR));
+				relNew.setSourceType(ModelSupport.idPrefixEntity.get(relation.getSource().substring(0, 2)));
+
+				relNew
+					.setTarget(getEntityId(relation.getTarget(), ENTITY_ID_SEPARATOR));
+				relNew.setTargetType(ModelSupport.idPrefixEntity.get(relation.getTarget().substring(0, 2)));
+
+				relNew
+					.setReltype(
+						RelType
+							.newInstance(
+								relation.getRelClass(),
+								relation.getSubRelType()));
+
+				Optional<DataInfo> odInfo = Optional.ofNullable(relation.getDataInfo());
+				if (odInfo.isPresent()) {
+					DataInfo dInfo = odInfo.get();
+					if (Optional.ofNullable(dInfo.getProvenanceaction()).isPresent() &&
+						Optional.ofNullable(dInfo.getProvenanceaction().getClassname()).isPresent()) {
+						relNew
+							.setProvenance(
+								Provenance
+									.newInstance(
+										dInfo.getProvenanceaction().getClassname(),
+										dInfo.getTrust()));
+					}
+				}
+				if (Boolean.TRUE.equals(relation.getValidated())) {
+					relNew.setValidated(relation.getValidated());
+					relNew.setValidationDate(relation.getValidationDate());
+				}
+
+				return relNew;
+			}, Encoders.bean(eu.dnetlib.dhp.oa.model.graph.Relation.class))
+			.write()
+			.mode(SaveMode.Overwrite)
+			.option("compression", "gzip")
+			.json(outputPath + "relation");
+	}
+
+	private static void organizationMap(SparkSession spark, String inputPath, String outputPath) {
+		Utils
+			.readPath(spark, inputPath + "organization", eu.dnetlib.dhp.schema.oaf.Organization.class)
+			.map(
+				(MapFunction<eu.dnetlib.dhp.schema.oaf.Organization, Organization>) o -> mapOrganization(o),
+				Encoders.bean(Organization.class))
+			.filter((FilterFunction<Organization>) o -> o != null)
+			.write()
+			.mode(SaveMode.Overwrite)
+			.option(COMPRESSION, GZIP)
+			.json(outputPath + "/organization");
+	}
+
+	private static eu.dnetlib.dhp.oa.model.graph.Organization mapOrganization(
+		eu.dnetlib.dhp.schema.oaf.Organization org) {
+
+		Organization organization = new Organization();
+
+		Optional
+			.ofNullable(org.getLegalshortname())
+			.ifPresent(value -> organization.setLegalshortname(value.getValue()));
+
+		Optional
+			.ofNullable(org.getLegalname())
+			.ifPresent(value -> organization.setLegalname(value.getValue()));
+
+		Optional
+			.ofNullable(org.getWebsiteurl())
+			.ifPresent(value -> organization.setWebsiteurl(value.getValue()));
+
+		Optional
+			.ofNullable(org.getAlternativeNames())
+			.ifPresent(
+				value -> organization
+					.setAlternativenames(
+						value
+							.stream()
+							.map(v -> v.getValue())
+							.collect(Collectors.toList())));
+
+		Optional
+			.ofNullable(org.getCountry())
+			.ifPresent(
+				value -> {
+					if (!value.getClassid().equals(eu.dnetlib.dhp.oa.graph.dump.complete.Constants.UNKNOWN)) {
+						organization
+							.setCountry(
+								eu.dnetlib.dhp.oa.model.Country.newInstance(value.getClassid(), value.getClassname()));
+					}
+
+				});
+
+		Optional
+			.ofNullable(org.getId())
+			.ifPresent(value -> organization.setId(getEntityId(value, ENTITY_ID_SEPARATOR)));
+
+		Optional
+			.ofNullable(org.getPid())
+			.ifPresent(
+				value -> organization
+					.setPid(
+						value
+							.stream()
+							.map(p -> OrganizationPid.newInstance(p.getQualifier().getClassid(), p.getValue()))
+							.collect(Collectors.toList())));
+
+		return organization;
+	}
+
+}
--- a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/organizationonly/oozie_app/config-default.xml
+++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/organizationonly/oozie_app/config-default.xml
@ -0,0 +1,30 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>hiveMetastoreUris</name>
+        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
+    </property>
+    <property>
+        <name>hiveJdbcUrl</name>
+        <value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
+    </property>
+    <property>
+        <name>hiveDbName</name>
+        <value>openaire</value>
+    </property>
+    <property>
+        <name>oozie.launcher.mapreduce.user.classpath.first</name>
+        <value>true</value>
+    </property>
+</configuration>
--- a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/organizationonly/oozie_app/workflow.xml
+++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/organizationonly/oozie_app/workflow.xml
@ -0,0 +1,88 @@
+<workflow-app name="dump_graph" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+
+        <property>
+            <name>sparkDriverMemory</name>
+            <description>memory for driver process</description>
+        </property>
+        <property>
+            <name>sparkExecutorMemory</name>
+            <description>memory for individual executor</description>
+        </property>
+        <property>
+            <name>sparkExecutorCores</name>
+            <description>number of cores used by single executor</description>
+        </property>
+        <property>
+            <name>oozieActionShareLibForSpark2</name>
+            <description>oozie action sharelib for spark 2.*</description>
+        </property>
+        <property>
+            <name>spark2ExtraListeners</name>
+            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
+            <description>spark 2.* extra listeners classname</description>
+        </property>
+        <property>
+            <name>spark2SqlQueryExecutionListeners</name>
+            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
+            <description>spark 2.* sql query execution listeners classname</description>
+        </property>
+        <property>
+            <name>spark2YarnHistoryServerAddress</name>
+            <description>spark 2.* yarn history server address</description>
+        </property>
+        <property>
+            <name>spark2EventLogDir</name>
+            <description>spark 2.* event log dir location</description>
+        </property>
+    </parameters>
+    <global>
+        <job-tracker>${jobTracker}</job-tracker>
+        <name-node>${nameNode}</name-node>
+        <configuration>
+            <property>
+                <name>mapreduce.job.queuename</name>
+                <value>${queueName}</value>
+            </property>
+            <property>
+                <name>oozie.launcher.mapred.job.queue.name</name>
+                <value>${oozieLauncherQueueName}</value>
+            </property>
+            <property>
+                <name>oozie.action.sharelib.for.spark</name>
+                <value>${oozieActionShareLibForSpark2}</value>
+            </property>
+        </configuration>
+    </global>
+    <start to="dump_organization"/>
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+    <action name="dump_organization">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Dump table organization and related relations </name>
+            <class>eu.dnetlib.dhp.oa.graph.dump.organizationonly.SparkDumpOrganizationJob</class>
+            <jar>dump-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+            </spark-opts>
+            <arg>--sourcePath</arg><arg>${sourcePath}/project</arg>
+            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/project</arg>
+            <arg>--communityMapPath</arg><arg>noneed</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <end name="End"/>
+</workflow-app>