From 774cdb190e498da204f162d1497f27dd68fb880a Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Tue, 13 Jul 2021 18:57:24 +0200 Subject: [PATCH] changes to mirror the last dump of the graph with the ols data model. --- .../dnetlib/doiboost/mag/MAGMappingTest.scala | 24 +++++-- .../dhp/oa/graph/dump/DumpProducts.java | 3 +- .../dhp/oa/graph/dump/SendToZenodoHDFS.java | 12 ++-- .../graph/dump/community/CommunitySplit.java | 10 +-- .../funderresults/SparkDumpFunderResults.java | 17 +++-- .../dump/complete/schema/result_schema.json | 70 ++++++++++++++++++- .../dump/QueryInformationSystemTest.java | 2 +- .../dump/complete/CreateRelationTest.java | 2 +- .../dump/funderresult/SplitPerFunderTest.java | 4 +- 9 files changed, 113 insertions(+), 31 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/mag/MAGMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/mag/MAGMappingTest.scala index 7eb50665e..d43e7ed37 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/mag/MAGMappingTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/mag/MAGMappingTest.scala @@ -70,9 +70,15 @@ class MAGMappingTest { implicit val formats = DefaultFormats - val conf = new SparkConf().setAppName("test").setMaster("local[2]") - val sc = new SparkContext(conf) - val spark = SparkSession.builder.config(sc.getConf).getOrCreate() + + val conf = new SparkConf().setAppName("test").setMaster("local[*]").set("spark.driver.host", "localhost") + + val spark: SparkSession = + SparkSession + .builder() + .appName(getClass.getSimpleName) + .config(conf) + .getOrCreate() val path = getClass.getResource("magPapers.json").getPath import org.apache.spark.sql.Encoders @@ -95,9 +101,15 @@ class MAGMappingTest { implicit val formats = DefaultFormats - val conf = new SparkConf().setAppName("test").setMaster("local[2]") - val sc = new SparkContext(conf) - val spark = SparkSession.builder.config(sc.getConf).getOrCreate() + + val conf = new SparkConf().setAppName("test").setMaster("local[*]").set("spark.driver.host", "localhost") + + val spark: SparkSession = + SparkSession + .builder() + .appName(getClass.getSimpleName) + .config(conf) + .getOrCreate() val path = getClass.getResource("duplicatedMagPapers.json").getPath import org.apache.spark.sql.Encoders diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/DumpProducts.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/DumpProducts.java index f5b8f1c76..4ddcea9e8 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/DumpProducts.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/DumpProducts.java @@ -37,7 +37,8 @@ public class DumpProducts implements Serializable { isSparkSessionManaged, spark -> { Utils.removeOutputDir(spark, outputPath); - execDump(spark, inputPath, outputPath, communityMapPath, inputClazz, outputClazz, dumpType); + execDump( + spark, inputPath, outputPath, communityMapPath, inputClazz, outputClazz, dumpType); }); } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/SendToZenodoHDFS.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/SendToZenodoHDFS.java index fd8262544..28881b253 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/SendToZenodoHDFS.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/SendToZenodoHDFS.java @@ -48,14 +48,14 @@ public class SendToZenodoHDFS implements Serializable { .orElse(false); final String depositionId = Optional.ofNullable(parser.get("depositionId")).orElse(null); - final String communityMapPath = parser.get("communityMapPath"); + //final String communityMapPath = parser.get("communityMapPath"); Configuration conf = new Configuration(); conf.set("fs.defaultFS", hdfsNameNode); FileSystem fileSystem = FileSystem.get(conf); - CommunityMap communityMap = Utils.readCommunityMap(fileSystem, communityMapPath); + //CommunityMap communityMap = Utils.readCommunityMap(fileSystem, communityMapPath); RemoteIterator fileStatusListIterator = fileSystem .listFiles( @@ -87,10 +87,10 @@ public class SendToZenodoHDFS implements Serializable { if (!p_string.endsWith("_SUCCESS")) { // String tmp = p_string.substring(0, p_string.lastIndexOf("/")); String name = p_string.substring(p_string.lastIndexOf("/") + 1); - log.info("Sending information for community: " + name); - if (communityMap.containsKey(name.substring(0, name.lastIndexOf(".")))) { - name = communityMap.get(name.substring(0, name.lastIndexOf("."))).replace(" ", "_") + ".tar"; - } +// log.info("Sending information for community: " + name); +// if (communityMap.containsKey(name.substring(0, name.lastIndexOf(".")))) { +// name = communityMap.get(name.substring(0, name.lastIndexOf("."))).replace(" ", "_") + ".tar"; +// } FSDataInputStream inputStream = fileSystem.open(p); zenodoApiClient.uploadIS(inputStream, name, fileStatus.getLen()); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/community/CommunitySplit.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/community/CommunitySplit.java index ec16a65e0..d64c26503 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/community/CommunitySplit.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/community/CommunitySplit.java @@ -34,12 +34,13 @@ public class CommunitySplit implements Serializable { isSparkSessionManaged, spark -> { Utils.removeOutputDir(spark, outputPath); - execSplit(spark, inputPath, outputPath, Utils.getCommunityMap(spark, communityMapPath).keySet()); + CommunityMap communityMap = Utils.getCommunityMap(spark, communityMapPath); + execSplit(spark, inputPath, outputPath, communityMap); }); } private static void execSplit(SparkSession spark, String inputPath, String outputPath, - Set communities) { + CommunityMap communities) { Dataset result = Utils .readPath(spark, inputPath + "/publication", CommunityResult.class) @@ -48,8 +49,9 @@ public class CommunitySplit implements Serializable { .union(Utils.readPath(spark, inputPath + "/software", CommunityResult.class)); communities + .keySet() .stream() - .forEach(c -> printResult(c, result, outputPath)); + .forEach(c -> printResult(c, result, outputPath + "/" + communities.get(c).replace(" ", "_"))); } @@ -61,7 +63,7 @@ public class CommunitySplit implements Serializable { .write() .option("compression", "gzip") .mode(SaveMode.Overwrite) - .json(outputPath + "/" + c); + .json(outputPath); } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/funderresults/SparkDumpFunderResults.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/funderresults/SparkDumpFunderResults.java index b124b76ce..9fcdcaf78 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/funderresults/SparkDumpFunderResults.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/funderresults/SparkDumpFunderResults.java @@ -103,12 +103,13 @@ public class SparkDumpFunderResults implements Serializable { } else { funderdump = fundernsp.substring(0, fundernsp.indexOf("_")).toUpperCase(); } - writeFunderResult(funder, result, outputPath , funderdump); + writeFunderResult(funder, result, outputPath, funderdump); }); } - private static void dumpResults(String nsp, Dataset results, String outputPath, String funderName) { + private static void dumpResults(String nsp, Dataset results, String outputPath, + String funderName) { results.map((MapFunction) r -> { if (!Optional.ofNullable(r.getProjects()).isPresent()) { @@ -128,24 +129,22 @@ public class SparkDumpFunderResults implements Serializable { } return null; }, Encoders.bean(CommunityResult.class)) - .filter((FilterFunction) r -> r!= null) + .filter((FilterFunction) r -> r != null) .write() .mode(SaveMode.Overwrite) .option("compression", "gzip") .json(outputPath + "/" + funderName); } - private static void writeFunderResult(String funder, Dataset results, String outputPath, - String funderDump) { + String funderDump) { if (funder.startsWith("40|irb")) { - dumpResults(funder, results, outputPath, "HRZZ"); - dumpResults(funder, results, outputPath, "MZOS"); + dumpResults(funder, results, outputPath, "HRZZ"); + dumpResults(funder, results, outputPath, "MZOS"); } else - dumpResults(funder, results, outputPath, funderDump); + dumpResults(funder, results, outputPath, funderDump); } - } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/schema/result_schema.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/schema/result_schema.json index 867fd5a77..03cbfb074 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/schema/result_schema.json +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/schema/result_schema.json @@ -1,6 +1,23 @@ { "$schema": "http://json-schema.org/draft-07/schema#", "definitions": { + "AccessRight":{ + "type":"object", + "properties":{ + "code": { + "type": "string", + "description": "COAR access mode code: http://vocabularies.coar-repositories.org/documentation/access_rights/" + }, + "label": { + "type": "string", + "description": "Label for the access mode" + }, + "scheme": { + "type": "string", + "description": "Scheme of reference for access right code. Always set to COAR access rights vocabulary: http://vocabularies.coar-repositories.org/documentation/access_rights/" + } + } + }, "ControlledField": { "type": "object", "properties": { @@ -266,6 +283,57 @@ ] } }, + "instance":{ + "type":"array", + "items":{ + "type":"object", + "properties":{ + "accessright":{ + "allOf":[ + { + "$ref":"#/definitions/AccessRight" + }, + { + "description":"The accessright of this materialization of the result" + } + ] + }, + "articleprocessingcharge":{ + "type":"object", + "properties":{ + "amount":{ + "type":"string" + }, + "currency":{ + "type":"string" + } + } + }, + "license":{ + "type":"string" + }, + "publicationdate":{ + "type":"string" + }, + "refereed":{ + "type":"string" + }, + "type":{ + "type":"string", + "description":"The specific sub-type of this materialization of the result (see https://api.openaire.eu/vocabularies/dnet:result_typologies following the links)" + }, + "url":{ + "description":"Description of url", + "type":"array", + "items":{ + "type":"string", + "description":"urls where it is possible to access the materialization of the result" + } + } + }, + "description":"One of the materialization for this result" + } + }, "programmingLanguage": { "type": "string", "description": "Only for results with type 'software': the programming language" @@ -302,7 +370,7 @@ "subject": { "allOf": [ {"$ref": "#/definitions/ControlledField"}, - {"description": "OpenAIRE subject classification scheme (https://api.openaire.eu/vocabularies/dnet:subject_classification_typologies) and value. When the scheme is 'keyword', it means that the subject is free-text (i.e. not a term from a controlled vocabulary)."}, + {"description": "OpenAIRE subject classification scheme (https://api.openaire.eu/vocabularies/dnet:subject_classification_typologies) and value. When the scheme is 'keyword', it means that the subject is free-text (i.e. not a term from a controlled vocabulary)."} ] } } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/dump/QueryInformationSystemTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/dump/QueryInformationSystemTest.java index c6666342a..499df3a09 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/dump/QueryInformationSystemTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/dump/QueryInformationSystemTest.java @@ -70,7 +70,7 @@ public class QueryInformationSystemTest { lenient().when(isLookUpService.quickSearchProfile(XQUERY)).thenReturn(communityMap); queryInformationSystem = new QueryInformationSystem(); queryInformationSystem.setIsLookUp(isLookUpService); - map = queryInformationSystem.getCommunityMap(); + map = queryInformationSystem.getCommunityMap(false, null); } @Test diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/dump/complete/CreateRelationTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/dump/complete/CreateRelationTest.java index 5f21e0bc5..448034d0c 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/dump/complete/CreateRelationTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/dump/complete/CreateRelationTest.java @@ -379,7 +379,7 @@ public class CreateRelationTest { " \n" + " 675858\n" + " \n" + - " H2020-EINFRA-2015-1\n" + + " EC | H2020 | RIA\n" + " EC\n" + " West-Life\n" + " \n" + diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/dump/funderresult/SplitPerFunderTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/dump/funderresult/SplitPerFunderTest.java index eefb0b9cb..26187f0a3 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/dump/funderresult/SplitPerFunderTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/dump/funderresult/SplitPerFunderTest.java @@ -139,8 +139,8 @@ public class SplitPerFunderTest { // MZOS 1 tmp = sc - .textFile(workingDir.toString() + "/split/MZOS") - .map(item -> OBJECT_MAPPER.readValue(item, CommunityResult.class)); + .textFile(workingDir.toString() + "/split/MZOS") + .map(item -> OBJECT_MAPPER.readValue(item, CommunityResult.class)); Assertions.assertEquals(1, tmp.count()); }