diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/ExtendEoscResultWithOrganizationStep2.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/ExtendEoscResultWithOrganizationStep2.java index 57a3e24..8d106ef 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/ExtendEoscResultWithOrganizationStep2.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/ExtendEoscResultWithOrganizationStep2.java @@ -101,9 +101,6 @@ public class ExtendEoscResultWithOrganizationStep2 implements Serializable { .joinWith(relation, result.col("id").equalTo(relation.col("source"))) .map((MapFunction, Relation>) t2 -> t2._2(), Encoders.bean(Relation.class)); - eoscRelation - .foreach((ForeachFunction) r -> System.out.println(new ObjectMapper().writeValueAsString(r))); - // from eoscRelation select the organization eoscRelation .joinWith(organization, eoscRelation.col("target").equalTo(organization.col("id"))) diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscResultsJobStep1.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscResultsJobStep1.java index 631058a..65fcd54 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscResultsJobStep1.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscResultsJobStep1.java @@ -77,17 +77,6 @@ public class SelectEoscResultsJobStep1 implements Serializable { String inputPath, String outputPath, Class inputClazz, String communityMapPath, String eoscDatasourceIdsPath) { -// final StructType structureSchema = new StructType() -// .add("eoscId", DataTypes.StringType) -// .add("graphId", DataTypes.StringType) -// .add("graphName", DataTypes.StringType); -// -// // .fromDDL("`graphId`: STRING, `eoscId`:STRING"); -// org.apache.spark.sql.Dataset df = spark -// .read() -// .schema(structureSchema) -// .json(eoscDatasourceIdsPath); - List df = Utils .readPath(spark, eoscDatasourceIdsPath, MasterDuplicate.class) .collectAsList(); diff 
--git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SparkUpdateProjectInfo.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SparkUpdateProjectInfo.java index e9fdb87..0f0859e 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SparkUpdateProjectInfo.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SparkUpdateProjectInfo.java @@ -112,7 +112,7 @@ public class SparkUpdateProjectInfo implements Serializable { Dataset project = Utils.readPath(spark, inputPath + "/project", Project.class); Dataset projectIds = result - .joinWith(resultProject, result.col("id").equalTo(resultProject.col("resultId"))) + .joinWith(resultProject, result.col("id").equalTo(resultProject.col("resultId")), "left") .flatMap( (FlatMapFunction, String>) t2 -> t2 ._2() @@ -134,7 +134,13 @@ public class SparkUpdateProjectInfo implements Serializable { .option("compression", "gzip") .json(outputPath + "project"); - resultProject + result + .joinWith( + resultProject, result.col("id").equalTo(resultProject.col("resultId")), + "left") + .map( + (MapFunction, ResultProject>) t2 -> t2._2(), + Encoders.bean(ResultProject.class)) .flatMap( (FlatMapFunction) rp -> rp .getProjectsList() diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc/job.properties b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc/job.properties new file mode 100644 index 0000000..d22cf14 --- /dev/null +++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc/job.properties @@ -0,0 +1,15 @@ +#PROPERTIES FOR EOSC DUMP +sourcePath=/tmp/prod_provision/graph/18_graph_blacklisted/ +outputPath=/tmp/miriam/graph_dumps/eosc_prod_extended +#accessToken for the openaire sandbox at the following connectionUrl +accessToken=OzzOsyucEIHxCEfhlpsMo3myEiwpCza3trCRL7ddfGTAK9xXkIP2MbXd6Vg4 +connectionUrl=https://sandbox.zenodo.org/api/deposit/depositions +singleDeposition=false +conceptRecordId=1094304 +depositionType=version +metadata="" +depositionId=6616871 +removeSet=merges;isMergedIn 
+postgresURL=jdbc:postgresql://postgresql.services.openaire.eu:5432/dnet_openaireplus +postgresUser=dnet +postgresPassword=dnetPwd \ No newline at end of file diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc/oozie_app/workflow.xml b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc/oozie_app/workflow.xml index c435fae..1513c70 100644 --- a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc/oozie_app/workflow.xml +++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc/oozie_app/workflow.xml @@ -225,8 +225,6 @@ - - yarn