From 0ce49049d6e9ad161943043c62da0ce446ce5b29 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Tue, 11 Aug 2020 15:25:11 +0200 Subject: [PATCH] added description --- .../dhp/oa/graph/dump/DumpProducts.java | 45 ++----------------- .../dhp/oa/graph/dump/graph/ContextInfo.java | 4 ++ .../dump/graph/CreateContextEntities.java | 19 +++----- .../dump/graph/CreateContextRelation.java | 10 +++-- .../graph/dump/graph/DumpGraphEntities.java | 6 ++- .../dhp/oa/graph/dump/graph/Extractor.java | 15 +++++-- .../dhp/oa/graph/dump/graph/Process.java | 8 ++-- .../dump/graph/QueryInformationSystem.java | 7 --- .../graph/dump/graph/SparkCollectAndSave.java | 8 +++- .../dump/graph/SparkDumpEntitiesJob.java | 17 ++----- .../dump/graph/SparkDumpRelationJob.java | 4 +- .../SparkExtractRelationFromEntities.java | 16 +++---- .../dump/graph/SparkOrganizationRelation.java | 5 ++- 13 files changed, 64 insertions(+), 100 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/DumpProducts.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/DumpProducts.java index 9f32d1d6aa..80de40f50f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/DumpProducts.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/DumpProducts.java @@ -1,3 +1,6 @@ +/** + * + */ package eu.dnetlib.dhp.oa.graph.dump; @@ -36,50 +39,10 @@ public class DumpProducts implements Serializable { isSparkSessionManaged, spark -> { Utils.removeOutputDir(spark, outputPath); - execDump(spark, inputPath, outputPath, communityMapPath, inputClazz, outputClazz, graph);// , - // dumpClazz); + execDump(spark, inputPath, outputPath, communityMapPath, inputClazz, outputClazz, graph); }); } -// public void run(Boolean isSparkSessionManaged, String inputPath, String outputPath, CommunityMap communityMap, -// Class inputClazz, -// Class outputClazz, -// boolean graph) { -// -// SparkConf conf = new SparkConf(); -// -// runWithSparkSession( -// conf, -// isSparkSessionManaged, -// spark -> { -// Utils.removeOutputDir(spark, outputPath); -// execDump(spark, inputPath, outputPath, communityMap, inputClazz, outputClazz, graph);// , -// // dumpClazz); -// }); -// } - -// public static void execDump( -// SparkSession spark, -// String inputPath, -// String outputPath, -// CommunityMap communityMap, -// Class inputClazz, -// Class outputClazz, -// boolean graph) { -// -// // CommunityMap communityMap = Utils.getCommunityMap(spark, communityMapPath); -// -// Utils -// .readPath(spark, inputPath, inputClazz) -// .map(value -> execMap(value, communityMap, graph), Encoders.bean(outputClazz)) -// .filter(Objects::nonNull) -// .write() -// .mode(SaveMode.Overwrite) -// .option("compression", "gzip") -// .json(outputPath); -// -// } - public static void execDump( SparkSession spark, String inputPath, diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/ContextInfo.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/ContextInfo.java index 041dbf5f77..a76fea8f44 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/ContextInfo.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/ContextInfo.java @@ -1,3 +1,7 @@ +/** + * Deserialization of the information in the context needed to create Context Entities, and relations between + * context entities and datasources and projects + */ package eu.dnetlib.dhp.oa.graph.dump.graph; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/CreateContextEntities.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/CreateContextEntities.java index 8cda0c8a2e..37d86f0630 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/CreateContextEntities.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/CreateContextEntities.java @@ -1,3 +1,9 @@ +/** + * Writes on HDFS Context entities. It queries the Information System at the lookup url provided as parameter + * and collects the general information for contexes of type community or ri. The general information is + * the id of the context, its label, the subjects associated to the context, its zenodo community, description and type. + * This information is used to create a new Context Entity + */ package eu.dnetlib.dhp.oa.graph.dump.graph; @@ -6,13 +12,9 @@ import java.io.IOException; import java.io.OutputStreamWriter; import java.io.Serializable; import java.nio.charset.StandardCharsets; -import java.util.List; -import java.util.Optional; import java.util.function.Consumer; import java.util.function.Function; -import javax.rmi.CORBA.Util; - import org.apache.commons.io.IOUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; @@ -21,19 +23,12 @@ import org.apache.hadoop.fs.Path; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.google.gson.Gson; - import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.oa.graph.dump.Utils; -import eu.dnetlib.dhp.schema.dump.oaf.graph.ResearchCommunity; import eu.dnetlib.dhp.schema.dump.oaf.graph.ResearchInitiative; public class CreateContextEntities implements Serializable { - // leggo i context dall'is e mi faccio la mappa id -> contextinfo - - // creo le entities con le info generali - private static final Logger log = LoggerFactory.getLogger(CreateContextEntities.class); private final Configuration conf; private final BufferedWriter writer; @@ -51,7 +46,7 @@ public class CreateContextEntities implements Serializable { final String hdfsPath = parser.get("hdfsPath"); log.info("hdfsPath: {}", hdfsPath); - final String hdfsNameNode = parser.get("hdfsNameNode"); + final String hdfsNameNode = parser.get("nameNode"); log.info("nameNode: {}", hdfsNameNode); final String isLookUpUrl = parser.get("isLookUpUrl"); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/CreateContextRelation.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/CreateContextRelation.java index e5a2ee36ba..2e5a56d3d3 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/CreateContextRelation.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/CreateContextRelation.java @@ -1,3 +1,8 @@ +/** + * Writes the set of new Relation between the context and datasources. At the moment the relation + * between the context and the project is not created because of a low coverage in the profiles of + * openaire ids related to projects + */ package eu.dnetlib.dhp.oa.graph.dump.graph; @@ -54,7 +59,7 @@ public class CreateContextRelation implements Serializable { final String hdfsPath = parser.get("hdfsPath"); log.info("hdfsPath: {}", hdfsPath); - final String hdfsNameNode = parser.get("hdfsNameNode"); + final String hdfsNameNode = parser.get("nameNode"); log.info("nameNode: {}", hdfsNameNode); final String isLookUpUrl = parser.get("isLookUpUrl"); @@ -101,8 +106,7 @@ public class CreateContextRelation implements Serializable { } - public void execute(final Function> producer, String category, String prefix) - throws Exception { + public void execute(final Function> producer, String category, String prefix) { final Consumer consumer = ci -> producer.apply(ci).forEach(c -> writeEntity(c)); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/DumpGraphEntities.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/DumpGraphEntities.java index 124d57c9d5..279700dad7 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/DumpGraphEntities.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/DumpGraphEntities.java @@ -1,3 +1,8 @@ +/** + * Dumps of entities in the model defined in eu.dnetlib.dhp.schema.dump.oaf.graph. + * Results are dumped using the same Mapper as for eu.dnetlib.dhp.schema.dump.oaf.community, while for + * the other entities the mapping is defined below + */ package eu.dnetlib.dhp.oa.graph.dump.graph; @@ -104,7 +109,6 @@ public class DumpGraphEntities implements Serializable { private static Datasource mapDatasource(eu.dnetlib.dhp.schema.oaf.Datasource d) { Datasource datasource = new Datasource(); - datasource.setId(d.getId()); Optional diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/Extractor.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/Extractor.java index 6a1ad9a406..7baab26834 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/Extractor.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/Extractor.java @@ -1,3 +1,12 @@ +/** + * Creates new Relations (as in eu.dnetlib.dhp.schema.dump.oaf.graph.Relation) from the information in the Entity. + * The new Relations are created for the datasource in the collectedfrom and hostedby elements and for the context + * related to communities and research initiative/infrastructures. + * + * For collectedfrom elements it creates: datasource -> provides -> result and result -> isProvidedBy -> datasource + * For hostedby elements it creates: datasource -> hosts -> result and result -> isHostedBy -> datasource + * For context elements it creates: context <-> isRelatedTo <-> result + */ package eu.dnetlib.dhp.oa.graph.dump.graph; @@ -24,12 +33,11 @@ import eu.dnetlib.dhp.schema.oaf.Result; public class Extractor implements Serializable { - public void run(Boolean isSparkSessionManaged, String inputPath, String outputPath, Class inputClazz, - CommunityMap communityMap) { + String communityMapPath) { SparkConf conf = new SparkConf(); @@ -39,11 +47,10 @@ public class Extractor implements Serializable { spark -> { Utils.removeOutputDir(spark, outputPath); extractRelationResult( - spark, inputPath, outputPath, inputClazz, communityMap); + spark, inputPath, outputPath, inputClazz, Utils.getCommunityMap(spark, communityMapPath)); }); } - private void extractRelationResult(SparkSession spark, String inputPath, String outputPath, diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/Process.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/Process.java index b17ed3b2a8..f806e58ddd 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/Process.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/Process.java @@ -1,3 +1,8 @@ +/** + * It process the ContextInfo information to produce a new Context Entity or a set of Relations between the + * generic context entity and datasource/projects related to the context. + * + */ package eu.dnetlib.dhp.oa.graph.dump.graph; @@ -8,8 +13,6 @@ import java.util.List; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.google.gson.Gson; - import eu.dnetlib.dhp.oa.graph.dump.Constants; import eu.dnetlib.dhp.oa.graph.dump.Utils; import eu.dnetlib.dhp.schema.common.ModelConstants; @@ -37,7 +40,6 @@ public class Process implements Serializable { ri.setDescription(ci.getDescription()); ri.setName(ci.getName()); ri.setZenodo_community(Constants.ZENODO_COMMUNITY_PREFIX + ci.getZenodocommunity()); - // log.info("created context: {}", new Gson().toJson(ri)); return (R) ri; } catch (final Exception e) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/QueryInformationSystem.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/QueryInformationSystem.java index e75201b0b7..e74d8a44c7 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/QueryInformationSystem.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/QueryInformationSystem.java @@ -126,13 +126,6 @@ public class QueryInformationSystem { } } -// cat_iterator = el.elementIterator(); -// while (cat_iterator.hasNext()) { -// Element catEl = (Element) cat_iterator.next(); -// if (catEl.getName().equals("param") && catEl.attribute("name").getValue().equals("openaireId")) { -// datasourceList.add(catEl.getText()); -// } -// } return datasourceList; } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/SparkCollectAndSave.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/SparkCollectAndSave.java index 50e2aac1cd..a0bfaa643b 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/SparkCollectAndSave.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/SparkCollectAndSave.java @@ -1,4 +1,7 @@ - +/** + * Reads all the entities of the same type (Relation / Results) and saves them in the same folder + * + */ package eu.dnetlib.dhp.oa.graph.dump.graph; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; @@ -75,8 +78,9 @@ public class SparkCollectAndSave implements Serializable { .union(Utils.readPath(spark, inputPath + "/relation/software", Relation.class)) .union(Utils.readPath(spark, inputPath + "/relation/contextOrg", Relation.class)) .union(Utils.readPath(spark, inputPath + "/relation/context", Relation.class)) + .union(Utils.readPath(spark, inputPath + "/relation/relation", Relation.class)) .write() - .mode(SaveMode.Append) + .mode(SaveMode.Overwrite) .option("compression", "gzip") .json(outputPath + "/relation"); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/SparkDumpEntitiesJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/SparkDumpEntitiesJob.java index 63caa4f692..e13dbf1742 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/SparkDumpEntitiesJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/SparkDumpEntitiesJob.java @@ -1,17 +1,14 @@ - +/** + * Spark Job that fires the dump for the entites + */ package eu.dnetlib.dhp.oa.graph.dump.graph; import java.io.Serializable; import java.util.Optional; - import org.apache.commons.io.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.oa.graph.dump.QueryInformationSystem; -import eu.dnetlib.dhp.oa.graph.dump.Utils; -import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap; import eu.dnetlib.dhp.schema.oaf.OafEntity; public class SparkDumpEntitiesJob implements Serializable { @@ -44,18 +41,10 @@ public class SparkDumpEntitiesJob implements Serializable { final String communityMapPath = parser.get("communityMapPath"); - final String isLookUpUrl = parser.get("isLookUpUrl"); - log.info("isLookUpUrl: {}", isLookUpUrl); - Class inputClazz = (Class) Class.forName(resultClassName); - QueryInformationSystem queryInformationSystem = new QueryInformationSystem(); - queryInformationSystem.setIsLookUp(Utils.getIsLookUpService(isLookUpUrl)); - CommunityMap communityMap = queryInformationSystem.getCommunityMap(); - DumpGraphEntities dg = new DumpGraphEntities(); dg.run(isSparkSessionManaged, inputPath, outputPath, inputClazz, communityMapPath); - // dg.run(isSparkSessionManaged, inputPath, outputPath, inputClazz, communityMap); } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/SparkDumpRelationJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/SparkDumpRelationJob.java index 3b9497ccdb..75e1ffcef8 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/SparkDumpRelationJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/SparkDumpRelationJob.java @@ -1,4 +1,6 @@ - +/** + * Dumps eu.dnetlib.dhp.schema.oaf.Relation in eu.dnetlib.dhp.schema.dump.oaf.graph.Relation + */ package eu.dnetlib.dhp.oa.graph.dump.graph; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/SparkExtractRelationFromEntities.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/SparkExtractRelationFromEntities.java index 16c7c7d328..b165fbb14f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/SparkExtractRelationFromEntities.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/SparkExtractRelationFromEntities.java @@ -1,4 +1,6 @@ - +/** + * Spark job that fires the extraction of relations from entities + */ package eu.dnetlib.dhp.oa.graph.dump.graph; import java.io.Serializable; @@ -42,20 +44,12 @@ public class SparkExtractRelationFromEntities implements Serializable { final String resultClassName = parser.get("resultTableName"); log.info("resultTableName: {}", resultClassName); -// final String communityMapPath = parser.get("communityMapPath"); - - final String isLookUpUrl = parser.get("isLookUpUrl"); - log.info("isLookUpUrl: {}", isLookUpUrl); + final String communityMapPath = parser.get("communityMapPath"); Class inputClazz = (Class) Class.forName(resultClassName); - QueryInformationSystem queryInformationSystem = new QueryInformationSystem(); - queryInformationSystem.setIsLookUp(Utils.getIsLookUpService(isLookUpUrl)); - CommunityMap communityMap = queryInformationSystem.getCommunityMap(); - Extractor extractor = new Extractor(); - // extractor.run(isSparkSessionManaged, inputPath, outputPath, inputClazz, communityMapPath); - extractor.run(isSparkSessionManaged, inputPath, outputPath, inputClazz, communityMap); + extractor.run(isSparkSessionManaged, inputPath, outputPath, inputClazz, communityMapPath); } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/SparkOrganizationRelation.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/SparkOrganizationRelation.java index f5ad0615d1..db9fb85453 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/SparkOrganizationRelation.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/graph/SparkOrganizationRelation.java @@ -1,4 +1,7 @@ - +/** + * Create new Relations between Context Entities and Organizations whose products are associated to the context. + * It produces relation such as: organization <-> isRelatedTo <-> context + */ package eu.dnetlib.dhp.oa.graph.dump.graph; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;