added code for dumping only openorgs organizations #12

Merged
miriam.baglioni merged 1 commit from dumpOrganizationOnly into master 2024-07-29 11:03:57 +02:00
3 changed files with 52 additions and 126 deletions

SparkDumpOrganizationJob.java

@@ -33,6 +33,7 @@ import eu.dnetlib.dhp.oa.graph.dump.Constants;
import eu.dnetlib.dhp.oa.graph.dump.ResultMapper;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
+import eu.dnetlib.dhp.oa.graph.dump.complete.SparkOrganizationRelation;
import eu.dnetlib.dhp.oa.graph.dump.exceptions.CardinalityTooHighException;
import eu.dnetlib.dhp.oa.graph.dump.exceptions.NoAvailableEntityTypeException;
import eu.dnetlib.dhp.oa.model.Container;
@@ -57,150 +58,45 @@ public class SparkDumpOrganizationJob implements Serializable {
public static final String GZIP = "gzip";
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkOrganizationRelation.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/organization_only_input_parameters.json"));
-Boolean isSparkSessionManaged = Boolean.TRUE;
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
+Boolean isSparkSessionManaged = Optional
+.ofNullable(parser.get("isSparkSessionManaged"))
+.map(Boolean::valueOf)
+.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = "/tmp/prod_provision/graph/20_graph_blacklisted/";
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = "/tmp/miriam/organizationsOnly/";
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
-// Utils.removeOutputDir(spark, outputPath);
+Utils.removeOutputDir(spark, outputPath);
organizationMap(spark, inputPath, outputPath);
-// relationMap2(spark, inputPath, outputPath);
});
}
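Read against the parameter spec below, a minimal invocation sketch — assuming ArgumentApplicationParser resolves the short option names "s", "out" and "ssm" declared in organization_only_input_parameters.json, as elsewhere in dnet-hadoop; both paths are hypothetical:

SparkDumpOrganizationJob.main(new String[] {
    "-ssm", "true",                    // optional: let the job manage its own SparkSession (the default)
    "-s", "/tmp/graph/",               // hypothetical graph source path
    "-out", "/tmp/dump/organization/"  // hypothetical dump output path
});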
-private static void relationMap2(SparkSession spark, String inputPath, String outputPath) {
-Utils
-.readPath(spark, inputPath + "relation", Relation.class)
-.filter((FilterFunction<Relation>) r -> r.getRelType().equalsIgnoreCase("organizationOrganization"))
-.map((MapFunction<Relation, eu.dnetlib.dhp.oa.model.graph.Relation>) relation -> {
-eu.dnetlib.dhp.oa.model.graph.Relation relNew = new eu.dnetlib.dhp.oa.model.graph.Relation();
-relNew
-.setSource(getEntityId(relation.getSource(), ENTITY_ID_SEPARATOR));
-relNew.setSourceType(ModelSupport.idPrefixEntity.get(relation.getSource().substring(0, 2)));
-relNew
-.setTarget(getEntityId(relation.getTarget(), ENTITY_ID_SEPARATOR));
-relNew.setTargetType(ModelSupport.idPrefixEntity.get(relation.getTarget().substring(0, 2)));
-relNew
-.setRelType(
-RelType
-.newInstance(
-relation.getRelClass(),
-relation.getSubRelType()));
-Optional<DataInfo> odInfo = Optional.ofNullable(relation.getDataInfo());
-if (odInfo.isPresent()) {
-DataInfo dInfo = odInfo.get();
-if (Optional.ofNullable(dInfo.getProvenanceaction()).isPresent() &&
-Optional.ofNullable(dInfo.getProvenanceaction().getClassname()).isPresent()) {
-relNew
-.setProvenance(
-Provenance
-.newInstance(
-dInfo.getProvenanceaction().getClassname(),
-dInfo.getTrust()));
-}
-}
-if (Boolean.TRUE.equals(relation.getValidated())) {
-relNew.setValidated(relation.getValidated());
-relNew.setValidationDate(relation.getValidationDate());
-}
-return relNew;
-}, Encoders.bean(eu.dnetlib.dhp.oa.model.graph.Relation.class))
-.write()
-.mode(SaveMode.Overwrite)
-.option("compression", "gzip")
-.json(outputPath + "relation");
-}
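The source/target handling above follows the OpenAIRE id layout: a two-character entity-type prefix, a separator, then the namespace-qualified identifier. A hedged illustration, assuming ENTITY_ID_SEPARATOR is "|" and getEntityId returns everything after it (the id value is made up):

String raw = "20|openorgs____::abc123";                                   // hypothetical organization id
String entityId = raw.substring(raw.indexOf('|') + 1);                    // "openorgs____::abc123"
String entityType = ModelSupport.idPrefixEntity.get(raw.substring(0, 2)); // "20" -> "organization"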
-private static void relationMap(SparkSession spark, String inputPath, String outputPath) {
-Dataset<eu.dnetlib.dhp.schema.oaf.Organization> organization = Utils
-.readPath(spark, inputPath + "organization", eu.dnetlib.dhp.schema.oaf.Organization.class);
-Dataset<Relation> rels = Utils.readPath(spark, inputPath + "relation", Relation.class);
-organization
-.joinWith(rels, organization.col("id").equalTo(rels.col("source")), "left")
-.map(
-(MapFunction<Tuple2<eu.dnetlib.dhp.schema.oaf.Organization, Relation>, Relation>) t2 -> t2._2(),
-Encoders.bean(Relation.class))
-.filter(Objects::nonNull)
-.write()
-.mode(SaveMode.Overwrite)
-.option("compression", "gzip")
-.json("/tmp/orgSource");
-rels = Utils.readPath(spark, "/tmp/orgSource", Relation.class);
-organization
-.joinWith(rels, organization.col("id").equalTo(rels.col("target")), "left")
-.map(
-(MapFunction<Tuple2<eu.dnetlib.dhp.schema.oaf.Organization, Relation>, Relation>) t2 -> t2._2(),
-Encoders.bean(Relation.class))
-.filter(Objects::nonNull)
-.write()
-.mode(SaveMode.Overwrite)
-.option("compression", "gzip")
-.json("/tmp/orgSourceTarget");
-Utils
-.readPath(spark, "/tmp/orgSourceTarget", Relation.class)
-.map((MapFunction<Relation, eu.dnetlib.dhp.oa.model.graph.Relation>) relation -> {
-eu.dnetlib.dhp.oa.model.graph.Relation relNew = new eu.dnetlib.dhp.oa.model.graph.Relation();
-relNew
-.setSource(getEntityId(relation.getSource(), ENTITY_ID_SEPARATOR));
-relNew.setSourceType(ModelSupport.idPrefixEntity.get(relation.getSource().substring(0, 2)));
-relNew
-.setTarget(getEntityId(relation.getTarget(), ENTITY_ID_SEPARATOR));
-relNew.setTargetType(ModelSupport.idPrefixEntity.get(relation.getTarget().substring(0, 2)));
-relNew
-.setRelType(
-RelType
-.newInstance(
-relation.getRelClass(),
-relation.getSubRelType()));
-Optional<DataInfo> odInfo = Optional.ofNullable(relation.getDataInfo());
-if (odInfo.isPresent()) {
-DataInfo dInfo = odInfo.get();
-if (Optional.ofNullable(dInfo.getProvenanceaction()).isPresent() &&
-Optional.ofNullable(dInfo.getProvenanceaction().getClassname()).isPresent()) {
-relNew
-.setProvenance(
-Provenance
-.newInstance(
-dInfo.getProvenanceaction().getClassname(),
-dInfo.getTrust()));
-}
-}
-if (Boolean.TRUE.equals(relation.getValidated())) {
-relNew.setValidated(relation.getValidated());
-relNew.setValidationDate(relation.getValidationDate());
-}
-return relNew;
-}, Encoders.bean(eu.dnetlib.dhp.oa.model.graph.Relation.class))
-.write()
-.mode(SaveMode.Overwrite)
-.option("compression", "gzip")
-.json(outputPath + "relation");
-}
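relationMap keeps only relations whose source and target both resolve to an organization, but it stages each join on a fixed /tmp path and reads it back. A hedged sketch of the same restriction chained in memory with inner joins (equivalent to the left joins plus nonNull filters above), before the final mapping and write:

Dataset<Relation> bySource = organization
    .joinWith(rels, organization.col("id").equalTo(rels.col("source")))
    .map(
        (MapFunction<Tuple2<eu.dnetlib.dhp.schema.oaf.Organization, Relation>, Relation>) t2 -> t2._2(),
        Encoders.bean(Relation.class));
Dataset<Relation> orgToOrg = organization
    .joinWith(bySource, organization.col("id").equalTo(bySource.col("target")))
    .map(
        (MapFunction<Tuple2<eu.dnetlib.dhp.schema.oaf.Organization, Relation>, Relation>) t2 -> t2._2(),
        Encoders.bean(Relation.class)); // both endpoints are organizations; no intermediate /tmp writes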
private static void organizationMap(SparkSession spark, String inputPath, String outputPath) {
Utils
.readPath(spark, inputPath + "organization", eu.dnetlib.dhp.schema.oaf.Organization.class)
.filter(
(FilterFunction<eu.dnetlib.dhp.schema.oaf.Organization>) o -> !o.getDataInfo().getDeletedbyinference()
&& o.getId().startsWith("20|openorgs"))
.map(
(MapFunction<eu.dnetlib.dhp.schema.oaf.Organization, Organization>) o -> mapOrganization(o),
Encoders.bean(Organization.class))
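The filter keeps only current OpenOrgs records: "20|" is the organization entity prefix, "openorgs" the namespace of the OpenOrgs registry, and deletedbyinference flags records merged away by deduplication. The predicate restated as a standalone helper (hypothetical, not part of this change):

static boolean isDumpableOrganization(eu.dnetlib.dhp.schema.oaf.Organization o) {
    return o.getDataInfo() != null                   // defensive null check, added in this sketch
        && !o.getDataInfo().getDeletedbyinference()  // skip records superseded by deduplication
        && o.getId().startsWith("20|openorgs");      // keep only OpenOrgs-provided organizations
}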

organization_only_input_parameters.json

@@ -0,0 +1,23 @@
[
{
"paramName":"s",
"paramLongName":"sourcePath",
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
}
]
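This spec is the jsonConfiguration the job loads above; only sourcePath and outputPath are required, which is why main() wraps isSparkSessionManaged in Optional.ofNullable. A hedged usage sketch (paths hypothetical, in a context that declares throws Exception):

ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(new String[] { "-s", "/tmp/graph/", "-out", "/tmp/dump/" });
parser.get("sourcePath");            // "/tmp/graph/"
parser.get("isSparkSessionManaged"); // null: optional and not supplied, hence the Optional.ofNullable above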

workflow.xml

@@ -1,6 +1,13 @@
<workflow-app name="dump_graph" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the graph source path</description>
</property>
<property>
<name>outputPath</name>
<description>the dump output path</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>