From 7ba586d2e587c117aacf50fb6e1e9f9b034df35c Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 17 Dec 2019 16:24:49 +0100 Subject: [PATCH 01/45] oozie workflow aimed to build the adjacency lists representation of the graph, needed to build the records to be indexed --- .../job-override.properties | 3 + dhp-workflows/dhp-graph-provision/pom.xml | 37 +++++++ .../java/eu/dnetlib/dhp/graph/EntityNode.java | 4 + .../dnetlib/dhp/graph/GraphMappingUtils.java | 23 ++++ .../eu/dnetlib/dhp/graph/RelatedEntity.java | 69 ++++++++++++ .../dhp/graph/SparkGraphIndexingJob.java | 102 ++++++++++++++++++ .../dhp/graph/input_graph_parameters.json | 5 + .../dhp/graph/oozie_app/config-default.xml | 26 +++++ .../dnetlib/dhp/graph/oozie_app/workflow.xml | 46 ++++++++ 9 files changed, 315 insertions(+) create mode 100644 dhp-workflows/dhp-graph-provision/job-override.properties create mode 100644 dhp-workflows/dhp-graph-provision/pom.xml create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/EntityNode.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphMappingUtils.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/RelatedEntity.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkGraphIndexingJob.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_graph_parameters.json create mode 100644 dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml diff --git a/dhp-workflows/dhp-graph-provision/job-override.properties b/dhp-workflows/dhp-graph-provision/job-override.properties new file mode 100644 index 000000000..31f7f88f5 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/job-override.properties @@ -0,0 
+1,3 @@ +sparkDriverMemory=16G +sparkExecutorMemory=16G +hive_db_name=claudio \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/pom.xml b/dhp-workflows/dhp-graph-provision/pom.xml new file mode 100644 index 000000000..d47463774 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/pom.xml @@ -0,0 +1,37 @@ + + + + dhp-workflows + eu.dnetlib.dhp + 1.0.5-SNAPSHOT + + 4.0.0 + + dhp-graph-provision + + + + + org.apache.spark + spark-core_2.11 + + + org.apache.spark + spark-sql_2.11 + + + + eu.dnetlib.dhp + dhp-common + ${project.version} + + + eu.dnetlib.dhp + dhp-schemas + ${project.version} + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/EntityNode.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/EntityNode.java new file mode 100644 index 000000000..be1babae2 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/EntityNode.java @@ -0,0 +1,4 @@ +package eu.dnetlib.dhp.graph; + +public class EntityNode { +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphMappingUtils.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphMappingUtils.java new file mode 100644 index 000000000..ab19ff2b5 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphMappingUtils.java @@ -0,0 +1,23 @@ +package eu.dnetlib.dhp.graph; + +import com.google.common.collect.Maps; +import eu.dnetlib.dhp.schema.oaf.*; + +import java.util.Map; + +public class GraphMappingUtils { + + public final static Map types = Maps.newHashMap(); + + static { + types.put("datasource", Datasource.class); + types.put("organization", Organization.class); + types.put("project", Project.class); + types.put("dataset", Dataset.class); + types.put("otherresearchproduct", OtherResearchProduct.class); + types.put("software", Software.class); + types.put("publication", 
Publication.class); + types.put("relation", Relation.class); + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/RelatedEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/RelatedEntity.java new file mode 100644 index 000000000..dbab04f16 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/RelatedEntity.java @@ -0,0 +1,69 @@ +package eu.dnetlib.dhp.graph; + +import java.io.Serializable; + +public class RelatedEntity implements Serializable { + + private String relType; + + private String subRelType; + + private String relClass; + + private String type; + + private String payload; + + public RelatedEntity(String relType, String subRelType, String relClass, String type, String payload) { + this.relType = relType; + this.subRelType = subRelType; + this.relClass = relClass; + this.type = type; + this.payload = payload; + } + + public String getRelType() { + return relType; + } + + public RelatedEntity setRelType(String relType) { + this.relType = relType; + return this; + } + + public String getSubRelType() { + return subRelType; + } + + public RelatedEntity setSubRelType(String subRelType) { + this.subRelType = subRelType; + return this; + } + + public String getRelClass() { + return relClass; + } + + public RelatedEntity setRelClass(String relClass) { + this.relClass = relClass; + return this; + } + + public String getType() { + return type; + } + + public RelatedEntity setType(String type) { + this.type = type; + return this; + } + + public String getPayload() { + return payload; + } + + public RelatedEntity setPayload(String payload) { + this.payload = payload; + return this; + } +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkGraphIndexingJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkGraphIndexingJob.java new file mode 100644 index 000000000..04711efbd --- /dev/null +++ 
b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkGraphIndexingJob.java @@ -0,0 +1,102 @@ +package eu.dnetlib.dhp.graph; + +import com.google.common.collect.Sets; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.common.EntityPayload; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import scala.Tuple2; +import scala.runtime.AbstractFunction1; + +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.apache.commons.lang3.StringUtils.substringAfter; +import static org.apache.commons.lang3.StringUtils.substringBefore; +import static org.apache.spark.sql.Encoders.bean; + +public class SparkGraphIndexingJob { + + private final static String ENTITY_NODES_PATH = "/tmp/entity_node"; + private static final long LIMIT = 100; + + public static void main(String[] args) throws Exception { + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkGraphIndexingJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_graph_parameters.json"))); + parser.parseArgument(args); + final SparkSession spark = SparkSession + .builder() + .appName(SparkGraphIndexingJob.class.getSimpleName()) + .master(parser.get("master")) + .config("hive.metastore.uris", parser.get("hive_metastore_uris")) + .config("spark.driver.cores", 1) + .config("spark.executor.cores", 1) + .config("spark.yarn.executor.memoryOverhead", "4G") + .config("spark.yarn.driver.memoryOverhead", "4G") + .enableHiveSupport() + .getOrCreate(); + + final String hiveDbName = parser.get("hive_db_name"); + + final FileSystem fs = 
FileSystem.get(spark.sparkContext().hadoopConfiguration()); + if (fs.exists(new Path(ENTITY_NODES_PATH))) { + fs.delete(new Path(ENTITY_NODES_PATH), true); + } + + spark + .sql(getJoinEntitiesSQL(hiveDbName)) + .transform(toEntityNode()) + /* + .map((MapFunction) r -> { + return null; + }, bean(String.class)) + */ + .rdd() + + .saveAsTextFile(ENTITY_NODES_PATH, GzipCodec.class); + } + + private static AbstractFunction1, Dataset> toEntityNode() { + return new AbstractFunction1, Dataset>() { + @Override + public Dataset apply(Dataset d) { + return d.map((MapFunction) r -> { + + final List res = r.getList(r.fieldIndex("related_entity")); + final byte[] payload = r.getAs("payload"); + return new EntityNode(r.getAs("id"), r.getAs("type"), new String(payload)) + .setRelatedEntities(res + .stream() + .map(re -> new Tuple2<>(substringBefore(re, "@@"), substringAfter(re, "@@"))) + .map(re -> new RelatedEntity(r.getAs("reltype"), r.getAs("subreltype"), r.getAs("relclass"), re._1(), re._2())) + .limit(LIMIT) + .collect(Collectors.toList())); + + }, bean(EntityNode.class)); + } + }; + } + + private static String getJoinEntitiesSQL(String hiveDbName) { + return String.format( + "SELECT " + + "E_s.id AS id, " + + "E_s.type AS type, " + + "E_s.payload AS payload, " + + "r.reltype AS reltype, r.subreltype AS subreltype, r.relclass AS relclass, " + + "collect_list(concat(E_t.type, '@@', E_t.payload)) AS related_entity " + + "FROM %s.entities " + "" /*"TABLESAMPLE(0.1 PERCENT) "*/ + "E_s " + + "LEFT JOIN %s.relation r ON (r.source = E_s.id) " + + "JOIN %s.entities E_t ON (E_t.id = r.target) \n" + + "GROUP BY E_s.id, E_s.type, E_s.payload, r.reltype, r.subreltype, r.relclass", hiveDbName, hiveDbName, hiveDbName); + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_graph_parameters.json b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_graph_parameters.json new file mode 100644 index 000000000..613389d79 
--- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_graph_parameters.json @@ -0,0 +1,5 @@ +[ + {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, + {"paramName":"h", "paramLongName":"hive_metastore_uris","paramDescription": "the hive metastore uris", "paramRequired": true}, + {"paramName":"db", "paramLongName":"hive_db_name", "paramDescription": "the target hive database name", "paramRequired": true} +] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/config-default.xml new file mode 100644 index 000000000..fcab9dd00 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/config-default.xml @@ -0,0 +1,26 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hive_metastore_uris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + hive_db_name + openaire + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml new file mode 100644 index 000000000..473b697cd --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml @@ -0,0 +1,46 @@ + + + + hive_db_name + the target hive database name + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + ${jobTracker} + ${nameNode} + 
yarn-cluster + cluster + GraphIndexing + eu.dnetlib.dhp.graph.SparkGraphIndexingJob + dhp-graph-provision-${projectVersion}.jar + --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" + -mt yarn-cluster + --hive_db_name${hive_db_name} + --hive_metastore_uris${hive_metastore_uris} + + + + + + + \ No newline at end of file From f7b9a7a9af1edff432147b6cced9a6c1f1c42c9d Mon Sep 17 00:00:00 2001 From: Michele Artini Date: Fri, 10 Jan 2020 15:55:23 +0100 Subject: [PATCH 02/45] entity migration (partial implementation) --- dhp-workflows/dhp-aggregation/pom.xml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/dhp-workflows/dhp-aggregation/pom.xml b/dhp-workflows/dhp-aggregation/pom.xml index 328e783c4..d031c0308 100644 --- a/dhp-workflows/dhp-aggregation/pom.xml +++ b/dhp-workflows/dhp-aggregation/pom.xml @@ -25,6 +25,12 @@ dhp-common ${project.version} + + + eu.dnetlib.dhp + dhp-schemas + ${project.version} + net.sf.saxon @@ -44,6 +50,11 @@ jaxen jaxen + + + org.mongodb + mongo-java-driver + org.mockito From 97c239ee0d4a9fbfd7d70dd06ff1f7777de7e881 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 16 Jan 2020 12:02:28 +0200 Subject: [PATCH 03/45] WIP: trying to find a way to build the records for the index --- dhp-schemas/pom.xml | 5 + .../java/eu/dnetlib/dhp/schema/oaf/Oaf.java | 13 ++ .../job-override.properties | 8 +- .../eu/dnetlib/dhp/graph/EntityRelEntity.java | 53 +++++++ .../eu/dnetlib/dhp/graph/GraphJoiner.java | 139 ++++++++++++++++++ .../dhp/graph/SparkGraphIndexingJob.java | 70 +-------- .../java/eu/dnetlib/dhp/graph/TypedRow.java | 44 ++++++ .../dhp/graph/input_graph_parameters.json | 3 +- .../dnetlib/dhp/graph/oozie_app/workflow.xml | 3 +- 9 files 
changed, 265 insertions(+), 73 deletions(-) create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/EntityRelEntity.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/TypedRow.java diff --git a/dhp-schemas/pom.xml b/dhp-schemas/pom.xml index 20896a61d..491cbe668 100644 --- a/dhp-schemas/pom.xml +++ b/dhp-schemas/pom.xml @@ -26,6 +26,11 @@ commons-lang3 + + com.fasterxml.jackson.core + jackson-databind + + eu.dnetlib.dhp dhp-common diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java index 352ebbc6e..010633ec3 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java @@ -1,5 +1,8 @@ package eu.dnetlib.dhp.schema.oaf; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; + import java.io.Serializable; public abstract class Oaf implements Serializable { @@ -23,4 +26,14 @@ public abstract class Oaf implements Serializable { public void setLastupdatetimestamp(Long lastupdatetimestamp) { this.lastupdatetimestamp = lastupdatetimestamp; } + + @Override + public String toString() { + try { + return new ObjectMapper().writeValueAsString(this); + } catch (JsonProcessingException e) { + throw new RuntimeException(e); + } + } + } diff --git a/dhp-workflows/dhp-graph-provision/job-override.properties b/dhp-workflows/dhp-graph-provision/job-override.properties index 31f7f88f5..882053c1a 100644 --- a/dhp-workflows/dhp-graph-provision/job-override.properties +++ b/dhp-workflows/dhp-graph-provision/job-override.properties @@ -1,3 +1,5 @@ -sparkDriverMemory=16G -sparkExecutorMemory=16G -hive_db_name=claudio \ No newline at end of file +sparkDriverMemory=7G +sparkExecutorMemory=7G 
+sparkExecutorMemoryOverhead=5G +hive_db_name=claudio +sourcePath=/tmp/db_openaireplus_services_beta.export.2019.11.06 \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/EntityRelEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/EntityRelEntity.java new file mode 100644 index 000000000..ac89e4351 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/EntityRelEntity.java @@ -0,0 +1,53 @@ +package eu.dnetlib.dhp.graph; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.schema.oaf.Relation; + +import java.io.Serializable; + +public class EntityRelEntity implements Serializable { + private TypedRow source; + private Relation relation; + private TypedRow target; + + public EntityRelEntity(TypedRow source) { + this.source = source; + } + + public TypedRow getSource() { + return source; + } + + public EntityRelEntity setSource(TypedRow source) { + this.source = source; + return this; + } + + public Relation getRelation() { + return relation; + } + + public EntityRelEntity setRelation(Relation relation) { + this.relation = relation; + return this; + } + + public TypedRow getTarget() { + return target; + } + + public EntityRelEntity setTarget(TypedRow target) { + this.target = target; + return this; + } + + @Override + public String toString() { + try { + return new ObjectMapper().writeValueAsString(this); + } catch (JsonProcessingException e) { + throw new RuntimeException(e); + } + } +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java new file mode 100644 index 000000000..5764642dc --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java @@ -0,0 +1,139 @@ +package 
eu.dnetlib.dhp.graph; + +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.schema.oaf.*; +import org.apache.hadoop.io.Text; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.Optional; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import scala.Tuple2; + +import java.io.Serializable; + +public class GraphJoiner implements Serializable { + + public static final int MAX_RELS = 100; + + public void join(final SparkSession spark, final String inputPath, final String hiveDbName, final String outPath) { + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + /* + JavaPairRDD entities = sc.sequenceFile(inputPath + "/publication", Text.class, Text.class) + .map(item -> new ObjectMapper().readValue(item._2().toString(), Publication.class)) + .map(oaf -> new TypedRow("publication", oaf)) + .mapToPair(toPair()); + + */ + + JavaPairRDD entities = sc.sequenceFile(inputPath + "/datasource", Text.class, Text.class) + .map(item -> new ObjectMapper().readValue(item._2().toString(), Datasource.class)) + .map(oaf -> new TypedRow("datasource", oaf)) + .mapToPair(toPair()) + .union(sc.sequenceFile(inputPath + "/organization", Text.class, Text.class) + .map(item -> new ObjectMapper().readValue(item._2().toString(), Organization.class)) + .map(oaf -> new TypedRow("organization", oaf)) + .mapToPair(toPair())) + .union(sc.sequenceFile(inputPath + "/project", Text.class, Text.class) + .map(item -> new ObjectMapper().readValue(item._2().toString(), Project.class)) + .map(oaf -> new TypedRow("project", oaf)) + .mapToPair(toPair())) + .union(sc.sequenceFile(inputPath + "/dataset", Text.class, Text.class) + .map(item -> new ObjectMapper().readValue(item._2().toString(), Dataset.class)) + 
.map(oaf -> new TypedRow("dataset", oaf)) + .mapToPair(toPair())) + .union(sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class) + .map(item -> new ObjectMapper().readValue(item._2().toString(), OtherResearchProduct.class)) + .map(oaf -> new TypedRow("otherresearchproduct", oaf)) + .mapToPair(toPair())) + .union(sc.sequenceFile(inputPath + "/software", Text.class, Text.class) + .map(item -> new ObjectMapper().readValue(item._2().toString(), Software.class)) + .map(oaf -> new TypedRow("software", oaf)) + .mapToPair(toPair())); + /* + .union(sc.sequenceFile(inputPath + "/publication", Text.class, Text.class) + .map(item -> new ObjectMapper().readValue(item._2().toString(), Publication.class)) + .map(oaf -> new TypedRow("publication", oaf)) + .mapToPair(toPair())); + + */ + + /* + JavaRDD rels = sc.sequenceFile(inputPath + "/relation", Text.class, Text.class) + .map(item -> new ObjectMapper().readValue(item._2().toString(), Relation.class)) + .map(oaf -> new TypedRow("relation", oaf)) + .mapToPair(toPair()) + .groupByKey() + .map(t -> Iterables.limit(t._2(), MAX_RELS)) + .flatMap(t -> t.iterator()) + .map(t -> (Relation) t.getOaf()); + + spark.createDataset(rels.rdd(), Encoders.bean(Relation.class)) + .write() + .mode(SaveMode.Overwrite) + .saveAsTable(hiveDbName + ".relation_100"); + */ + + JavaPairRDD bounded_rels = spark.table(hiveDbName + ".relation_" + MAX_RELS) + .as(Encoders.bean(Relation.class)) + .javaRDD() + .map(r -> new TypedRow("relation", r)) + .mapToPair(toPair()); + + // build the adjacency list: e -> r + JavaPairRDD>> adjacency_list = entities.leftOuterJoin(bounded_rels); + + JavaRDD linked_entities = adjacency_list + .mapToPair(toPairTarget()) // make rel.targetid explicit so that we can join it + .leftOuterJoin(entities) // again with the entities to get the target entity + .map(l -> toEntityRelEntity(l)); // and map it to a more readable representation + + spark.createDataFrame(linked_entities, EntityRelEntity.class) + 
.write() + .mode(SaveMode.Overwrite) + .saveAsTable(hiveDbName + ".linked_entities"); + } + + private EntityRelEntity toEntityRelEntity(Tuple2>>, Optional>> l) { + // extract the entity source + final EntityRelEntity res = new EntityRelEntity(l._2()._1()._2()._1()); + + if(l._2()._1()._2()._2().isPresent() && l._2()._2().isPresent()) { + + // extract the relationship + res.setRelation((Relation) l._2()._1()._2()._2().get().getOaf()); + + // extract the related entity + res.setTarget(l._2()._2().get()); + } + + return res; + } + + private PairFunction>>, String, Tuple2>>> toPairTarget() { + return e -> { + Optional o = e._2()._2(); + if (o.isPresent()) { + return new Tuple2<>(((Relation) o.get().getOaf()).getTarget(), e); + } else { + return new Tuple2<>(null, e); + } + }; + } + + private PairFunction toPair() { + return e -> { + if (!"relation".equals(e.getType())) { + return new Tuple2<>( ((OafEntity) e.getOaf()).getId(), e); + } else { + return new Tuple2<>( ((Relation) e.getOaf()).getSource(), e); + } + }; + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkGraphIndexingJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkGraphIndexingJob.java index 04711efbd..ce8e7e396 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkGraphIndexingJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkGraphIndexingJob.java @@ -1,32 +1,14 @@ package eu.dnetlib.dhp.graph; -import com.google.common.collect.Sets; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.common.EntityPayload; import org.apache.commons.io.IOUtils; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.compress.GzipCodec; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import 
org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; -import scala.Tuple2; -import scala.runtime.AbstractFunction1; - -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; - -import static org.apache.commons.lang3.StringUtils.substringAfter; -import static org.apache.commons.lang3.StringUtils.substringBefore; -import static org.apache.spark.sql.Encoders.bean; public class SparkGraphIndexingJob { private final static String ENTITY_NODES_PATH = "/tmp/entity_node"; - private static final long LIMIT = 100; public static void main(String[] args) throws Exception { @@ -37,13 +19,10 @@ public class SparkGraphIndexingJob { .appName(SparkGraphIndexingJob.class.getSimpleName()) .master(parser.get("master")) .config("hive.metastore.uris", parser.get("hive_metastore_uris")) - .config("spark.driver.cores", 1) - .config("spark.executor.cores", 1) - .config("spark.yarn.executor.memoryOverhead", "4G") - .config("spark.yarn.driver.memoryOverhead", "4G") .enableHiveSupport() .getOrCreate(); + final String inputPath = parser.get("sourcePath"); final String hiveDbName = parser.get("hive_db_name"); final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration()); @@ -51,52 +30,7 @@ public class SparkGraphIndexingJob { fs.delete(new Path(ENTITY_NODES_PATH), true); } - spark - .sql(getJoinEntitiesSQL(hiveDbName)) - .transform(toEntityNode()) - /* - .map((MapFunction) r -> { - return null; - }, bean(String.class)) - */ - .rdd() - - .saveAsTextFile(ENTITY_NODES_PATH, GzipCodec.class); - } - - private static AbstractFunction1, Dataset> toEntityNode() { - return new AbstractFunction1, Dataset>() { - @Override - public Dataset apply(Dataset d) { - return d.map((MapFunction) r -> { - - final List res = r.getList(r.fieldIndex("related_entity")); - final byte[] payload = r.getAs("payload"); - return new EntityNode(r.getAs("id"), r.getAs("type"), new String(payload)) - .setRelatedEntities(res - .stream() - .map(re -> new 
Tuple2<>(substringBefore(re, "@@"), substringAfter(re, "@@"))) - .map(re -> new RelatedEntity(r.getAs("reltype"), r.getAs("subreltype"), r.getAs("relclass"), re._1(), re._2())) - .limit(LIMIT) - .collect(Collectors.toList())); - - }, bean(EntityNode.class)); - } - }; - } - - private static String getJoinEntitiesSQL(String hiveDbName) { - return String.format( - "SELECT " + - "E_s.id AS id, " + - "E_s.type AS type, " + - "E_s.payload AS payload, " + - "r.reltype AS reltype, r.subreltype AS subreltype, r.relclass AS relclass, " + - "collect_list(concat(E_t.type, '@@', E_t.payload)) AS related_entity " + - "FROM %s.entities " + "" /*"TABLESAMPLE(0.1 PERCENT) "*/ + "E_s " + - "LEFT JOIN %s.relation r ON (r.source = E_s.id) " + - "JOIN %s.entities E_t ON (E_t.id = r.target) \n" + - "GROUP BY E_s.id, E_s.type, E_s.payload, r.reltype, r.subreltype, r.relclass", hiveDbName, hiveDbName, hiveDbName); + new GraphJoiner().join(spark, inputPath, hiveDbName, ENTITY_NODES_PATH); } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/TypedRow.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/TypedRow.java new file mode 100644 index 000000000..5c933ca80 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/TypedRow.java @@ -0,0 +1,44 @@ +package eu.dnetlib.dhp.graph; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.schema.oaf.Oaf; + +import java.io.Serializable; + +public class TypedRow implements Serializable { + private String type; + private Oaf oaf; + + public TypedRow(String type, Oaf oaf) { + this.type = type; + this.oaf = oaf; + } + + public String getType() { + return type; + } + + public TypedRow setType(String type) { + this.type = type; + return this; + } + + public Oaf getOaf() { + return oaf; + } + + public TypedRow setOaf(Oaf oaf) { + this.oaf = oaf; + return this; + } + + @Override + 
public String toString() { + try { + return new ObjectMapper().writeValueAsString(this); + } catch (JsonProcessingException e) { + throw new RuntimeException(e); + } + } +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_graph_parameters.json b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_graph_parameters.json index 613389d79..a197abc78 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_graph_parameters.json +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_graph_parameters.json @@ -1,5 +1,6 @@ [ {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, {"paramName":"h", "paramLongName":"hive_metastore_uris","paramDescription": "the hive metastore uris", "paramRequired": true}, - {"paramName":"db", "paramLongName":"hive_db_name", "paramDescription": "the target hive database name", "paramRequired": true} + {"paramName":"db", "paramLongName":"hive_db_name", "paramDescription": "the target hive database name", "paramRequired": true}, + {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true} ] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml index 473b697cd..00a890268 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml @@ -33,8 +33,9 @@ GraphIndexing eu.dnetlib.dhp.graph.SparkGraphIndexingJob dhp-graph-provision-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} 
--driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" + --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" --conf spark.yarn.executor.memoryOverhead=${sparkExecutorMemoryOverhead} -mt yarn-cluster + --sourcePath${sourcePath} --hive_db_name${hive_db_name} --hive_metastore_uris${hive_metastore_uris} From 63c0db4ff8b8d62a062088d065003680d65bedf4 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 16 Jan 2020 15:54:53 +0200 Subject: [PATCH 04/45] instance URLs must be repeatable --- dhp-schemas/pom.xml | 2 +- .../src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/dhp-schemas/pom.xml b/dhp-schemas/pom.xml index 491cbe668..ec5af8d3c 100644 --- a/dhp-schemas/pom.xml +++ b/dhp-schemas/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib.dhp dhp - 1.0.5-SNAPSHOT + 1.1.5-SNAPSHOT ../ diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java index f27704c5c..8726b85ce 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; +import java.util.List; public class Instance implements Serializable { @@ -12,7 +13,7 @@ public class Instance implements Serializable { private KeyValue hostedby; - private String url; + private List url; // other research products specifc 
private String distributionlocation; @@ -53,11 +54,11 @@ public class Instance implements Serializable { this.hostedby = hostedby; } - public String getUrl() { + public List getUrl() { return url; } - public void setUrl(String url) { + public void setUrl(List url) { this.url = url; } From 749b0660abbb44b0fa96eff27fd986e15b0daeeb Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 17 Jan 2020 14:22:15 +0100 Subject: [PATCH 05/45] instance URLs must be repeatable --- .../src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java index ceaaad6b0..8f852af65 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; +import java.util.List; public class Instance implements Serializable { @@ -12,7 +13,7 @@ public class Instance implements Serializable { private KeyValue hostedby; - private String url; + private List url; // other research products specifc private String distributionlocation; @@ -53,11 +54,11 @@ public class Instance implements Serializable { this.hostedby = hostedby; } - public String getUrl() { + public List getUrl() { return url; } - public void setUrl(String url) { + public void setUrl(List url) { this.url = url; } From 81f82b5d34d67d61017d546fe761daf0b717ca27 Mon Sep 17 00:00:00 2001 From: Michele Artini Date: Fri, 17 Jan 2020 15:26:21 +0100 Subject: [PATCH 06/45] partial implementation of applications to migrate entities --- .../migration/AbstractMigrateApplication.java | 61 +++ .../eu/dnetlib/dhp/migration/DbClient.java | 58 +++ .../dnetlib/dhp/migration/MdstoreClient.java | 87 ++++ .../MigrateDbEntitiesApplication.java | 390 ++++++++++++++++++ 
.../MigrateMongoMdstoresApplication.java | 190 +++++++++ .../dnetlib/dhp/migration/MigrationUtils.java | 164 ++++++++ .../migrate_db_entities_parameters.json | 38 ++ .../migrate_mongo_mstores_parameters.json | 50 +++ .../sql/queryDatasourceOrganization.sql | 16 + .../dhp/migration/sql/queryDatasources.sql | 147 +++++++ .../dhp/migration/sql/queryOrganizations.sql | 36 ++ .../sql/queryOrganizationsFromOpenOrgsDB.sql | 53 +++ .../sql/queryProjectOrganization.sql | 16 + .../dhp/migration/sql/queryProjects.sql | 87 ++++ .../sql/querySimilarityFromOpenOrgsDB.sql | 17 + 15 files changed, 1410 insertions(+) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrateApplication.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/DbClient.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MdstoreClient.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateMongoMdstoresApplication.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrationUtils.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasourceOrganization.sql create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasources.sql create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryOrganizations.sql create mode 100644 
dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryOrganizationsFromOpenOrgsDB.sql create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjectOrganization.sql create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects.sql create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/querySimilarityFromOpenOrgsDB.sql diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrateApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrateApplication.java new file mode 100644 index 000000000..a5c8b2775 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrateApplication.java @@ -0,0 +1,61 @@ +package eu.dnetlib.dhp.migration; + +import java.io.Closeable; +import java.io.IOException; +import java.net.URI; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.codehaus.jackson.map.ObjectMapper; + +import eu.dnetlib.dhp.schema.oaf.Oaf; + +public class AbstractMigrateApplication implements Closeable { + + private final AtomicInteger counter = new AtomicInteger(0); + + private final IntWritable key = new IntWritable(counter.get()); + + private final Text value = new Text(); + + private final ObjectMapper objectMapper = new ObjectMapper(); + + private final SequenceFile.Writer writer; + + public AbstractMigrateApplication(final String hdfsPath, final String hdfsNameNode, final String hdfsUser) throws Exception { + this.writer = SequenceFile.createWriter(getConf(hdfsNameNode, hdfsUser), SequenceFile.Writer.file(new Path(hdfsPath)), 
SequenceFile.Writer + .keyClass(IntWritable.class), SequenceFile.Writer.valueClass(Text.class)); + } + + private Configuration getConf(final String hdfsNameNode, final String hdfsUser) throws IOException { + final Configuration conf = new Configuration(); + conf.set("fs.defaultFS", hdfsNameNode); + conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); + System.setProperty("HADOOP_USER_NAME", hdfsUser); + System.setProperty("hadoop.home.dir", "/"); + FileSystem.get(URI.create(hdfsNameNode), conf); + return conf; + } + + protected void emitOaf(final Oaf oaf) { + try { + key.set(counter.getAndIncrement()); + value.set(objectMapper.writeValueAsString(oaf)); + writer.append(key, value); + } catch (final Exception e) { + e.printStackTrace(); + } + } + + @Override + public void close() throws IOException { + writer.close(); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/DbClient.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/DbClient.java new file mode 100644 index 000000000..e9fee63b9 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/DbClient.java @@ -0,0 +1,58 @@ +package eu.dnetlib.dhp.migration; + +import java.io.Closeable; +import java.io.IOException; +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import java.util.function.Consumer; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +public class DbClient implements Closeable { + + private static final Log log = LogFactory.getLog(DbClient.class); + + private Connection connection; + + public DbClient(final String address, final String login, final String password) { + + try { + Class.forName("org.postgresql.Driver"); + this.connection = 
DriverManager.getConnection(address, login, password); + this.connection.setAutoCommit(false); + } catch (final Exception e) { + log.error(e.getClass().getName() + ": " + e.getMessage()); + throw new RuntimeException(e); + } + log.info("Opened database successfully"); + } + + public void processResults(final String sql, final Consumer consumer) { + /* Runs the caller-supplied SQL and hands every row to the consumer; any SQLException is rethrown as a RuntimeException. */ + try (final Statement stmt = connection.createStatement()) { + try (final ResultSet rs = stmt.executeQuery(sql)) { + while (rs.next()) { + consumer.accept(rs); + } + } catch (final SQLException e) { + throw new RuntimeException(e); + } + } catch (final SQLException e1) { + throw new RuntimeException(e1); + } + } + + @Override + public void close() throws IOException { + try { + connection.close(); + } catch (final SQLException e) { + throw new RuntimeException(e); + } + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MdstoreClient.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MdstoreClient.java new file mode 100644 index 000000000..971d7f165 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MdstoreClient.java @@ -0,0 +1,87 @@ +package eu.dnetlib.dhp.migration; + +import java.io.Closeable; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.stream.StreamSupport; + +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.bson.Document; + +import com.google.common.collect.Iterables; +import com.mongodb.MongoClient; +import com.mongodb.MongoClientURI; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.MongoDatabase; + +public class MdstoreClient implements Closeable { + + private final MongoClient client; + private final MongoDatabase db; + + private static final String COLL_METADATA = "metadata"; + private static final String
COLL_METADATA_MANAGER = "metadataManager"; + + private static final Log log = LogFactory.getLog(MdstoreClient.class); + + public MdstoreClient(final String baseUrl, final String dbName) { + this.client = new MongoClient(new MongoClientURI(baseUrl)); + this.db = getDb(client, dbName); + } + + public Map validCollections(final String mdFormat, final String mdLayout, final String mdInterpretation) { + + final Map transactions = new HashMap<>(); + for (final Document entry : getColl(db, COLL_METADATA_MANAGER).find()) { + final String mdId = entry.getString("mdId"); + final String currentId = entry.getString("currentId"); + if (StringUtils.isNoneBlank(mdId, currentId)) { + transactions.put(mdId, currentId); + } + } + + final Map res = new HashMap<>(); + for (final Document entry : getColl(db, COLL_METADATA).find()) { + if (entry.getString("format").equals(mdFormat) && entry.getString("layout").equals(mdLayout) + && entry.getString("interpretation").equals(mdInterpretation) && transactions.containsKey(entry.getString("mdId"))) { + res.put(entry.getString("mdId"), transactions.get(entry.getString("mdId"))); + } + } + + return res; + } + + private MongoDatabase getDb(final MongoClient client, final String dbName) { + if (!Iterables.contains(client.listDatabaseNames(), dbName)) { + final String err = String.format("Database '%s' not found in %s", dbName, client.getAddress()); + log.warn(err); + throw new RuntimeException(err); + } + return client.getDatabase(dbName); + } + + private MongoCollection getColl(final MongoDatabase db, final String collName) { + if (!Iterables.contains(db.listCollectionNames(), collName)) { + final String err = String.format("Missing collection '%s' in database '%s'", collName, db.getName()); // format once: a '%' in either name must not be re-parsed as a format specifier + log.warn(err); + throw new RuntimeException(err); + } + return db.getCollection(collName); + } + + public Iterable listRecords(final String coll) { + return () -> StreamSupport.stream(getColl(db, coll).find().spliterator(), false) +
.filter(e -> e.containsKey("body")) + .map(e -> e.getString("body")) + .iterator(); + } + + @Override + public void close() throws IOException { + client.close(); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java new file mode 100644 index 000000000..60a7c24f7 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java @@ -0,0 +1,390 @@ +package eu.dnetlib.dhp.migration; + +import java.io.Closeable; +import java.io.IOException; +import java.sql.ResultSet; +import java.util.Arrays; +import java.util.function.Consumer; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.Datasource; +import eu.dnetlib.dhp.schema.oaf.Organization; +import eu.dnetlib.dhp.schema.oaf.Project; +import eu.dnetlib.dhp.schema.oaf.Relation; + +public class MigrateDbEntitiesApplication extends AbstractMigrateApplication implements Closeable { + + private static final Log log = LogFactory.getLog(MigrateDbEntitiesApplication.class); + + private final DbClient dbClient; + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils.toString(MigrateDbEntitiesApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json"))); + + parser.parseArgument(args); + + final String dbUrl = parser.get("postgresUrl"); + final String dbUser = parser.get("postgresUser"); + final String dbPassword = parser.get("postgresPassword"); + + final String hdfsPath = parser.get("hdfsPath"); + final String hdfsNameNode = parser.get("namenode"); 
+ final String hdfsUser = parser.get("hdfsUser"); + + try (final MigrateDbEntitiesApplication smdbe = new MigrateDbEntitiesApplication(hdfsPath, hdfsNameNode, hdfsUser, dbUrl, dbUser, dbPassword)) { + smdbe.execute("queryDatasources.sql", smdbe::processDatasource); + smdbe.execute("queryProjects.sql", smdbe::processProject); + smdbe.execute("queryOrganizations.sql", smdbe::processOrganization); + smdbe.execute("queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization); + smdbe.execute("queryProjectOrganization.sql", smdbe::processProjectOrganization); + } + + } + + public MigrateDbEntitiesApplication(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String dbUrl, final String dbUser, + final String dbPassword) throws Exception { + super(hdfsPath, hdfsNameNode, hdfsUser); + this.dbClient = new DbClient(dbUrl, dbUser, dbPassword); + } + + public void execute(final String sqlFile, final Consumer consumer) throws Exception { + final String sql = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/migration/sql/" + sqlFile)); + dbClient.processResults(sql, consumer); + } + + public void processDatasource(final ResultSet rs) { + try { + + final DataInfo info = MigrationUtils.dataInfo(null, null, null, null, null, null); // TODO + + final Datasource ds = new Datasource(); + + ds.setId(MigrationUtils.createOpenaireId("10", rs.getString("datasourceid"))); + ds.setOriginalId(Arrays.asList(rs.getString("datasourceid"))); + ds.setCollectedfrom(MigrationUtils.listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); + ds.setPid(null); // List // TODO + ds.setDateofcollection(rs.getDate("dateofcollection").toString()); + ds.setDateoftransformation(null); // TODO + ds.setExtraInfo(null); // TODO + ds.setOaiprovenance(null); // TODO + + ds.setDatasourcetype(null); // Qualifier datasourcetype) { + ds.setOpenairecompatibility(null); // Qualifier openairecompatibility) { + 
ds.setOfficialname(MigrationUtils.field(rs.getString("officialname"), info)); + ds.setEnglishname(MigrationUtils.field(rs.getString("englishname"), info)); + ds.setWebsiteurl(MigrationUtils.field(rs.getString("websiteurl"), info)); + ds.setLogourl(MigrationUtils.field(rs.getString("logourl"), info)); + ds.setContactemail(MigrationUtils.field(rs.getString("contactemail"), info)); + ds.setNamespaceprefix(MigrationUtils.field(rs.getString("namespaceprefix"), info)); + ds.setLatitude(MigrationUtils.field(Double.toString(rs.getDouble("latitude")), info)); + ds.setLongitude(MigrationUtils.field(Double.toString(rs.getDouble("longitude")), info)); + ds.setDateofvalidation(MigrationUtils.field(rs.getDate("dateofvalidation").toString(), info)); + ds.setDescription(MigrationUtils.field(rs.getString("description"), info)); + ds.setSubjects(null); // List subjects) { + ds.setOdnumberofitems(MigrationUtils.field(Double.toString(rs.getInt("odnumberofitems")), info)); + ds.setOdnumberofitemsdate(MigrationUtils.field(rs.getDate("odnumberofitemsdate").toString(), info)); + ds.setOdpolicies(MigrationUtils.field(rs.getString("odpolicies"), info)); + ds.setOdlanguages(MigrationUtils.listFields(info, rs.getArray("odlanguages"))); + ds.setOdcontenttypes(MigrationUtils.listFields(info, rs.getArray("odcontenttypes"))); + ds.setAccessinfopackage(MigrationUtils.listFields(info, rs.getArray("accessinfopackage"))); + ds.setReleasestartdate(MigrationUtils.field(rs.getDate("releasestartdate").toString(), info)); + ds.setReleaseenddate(MigrationUtils.field(rs.getDate("releaseenddate").toString(), info)); + ds.setMissionstatementurl(MigrationUtils.field(rs.getString("missionstatementurl"), info)); + ds.setDataprovider(MigrationUtils.field(rs.getBoolean("dataprovider"), info)); + ds.setServiceprovider(MigrationUtils.field(rs.getBoolean("serviceprovider"), info)); + ds.setDatabaseaccesstype(MigrationUtils.field(rs.getString("databaseaccesstype"), info)); + 
ds.setDatauploadtype(MigrationUtils.field(rs.getString("datauploadtype"), info)); + ds.setDatabaseaccessrestriction(MigrationUtils.field(rs.getString("databaseaccessrestriction"), info)); + ds.setDatauploadrestriction(MigrationUtils.field(rs.getString("datauploadrestriction"), info)); + ds.setVersioning(MigrationUtils.field(rs.getBoolean("versioning"), info)); + ds.setCitationguidelineurl(MigrationUtils.field(rs.getString("citationguidelineurl"), info)); + ds.setQualitymanagementkind(MigrationUtils.field(rs.getString("qualitymanagementkind"), info)); + ds.setPidsystems(MigrationUtils.field(rs.getString("pidsystems"), info)); + ds.setCertificates(MigrationUtils.field(rs.getString("certificates"), info)); + ds.setPolicies(null); // List // TODO + ds.setJournal(null); // Journal // TODO + + // rs.getString("datasourceid"); + rs.getArray("identities"); + // rs.getString("officialname"); + // rs.getString("englishname"); + // rs.getString("contactemail"); + rs.getString("openairecompatibility"); // COMPLEX ...@@@... 
+ // rs.getString("websiteurl"); + // rs.getString("logourl"); + // rs.getArray("accessinfopackage"); + // rs.getDouble("latitude"); + // rs.getDouble("longitude"); + // rs.getString("namespaceprefix"); + // rs.getInt("odnumberofitems"); // NULL + // rs.getDate("odnumberofitemsdate"); // NULL + rs.getArray("subjects"); + // rs.getString("description"); + // rs.getString("odpolicies"); // NULL + // rs.getArray("odlanguages"); + // rs.getArray("odcontenttypes"); + rs.getBoolean("inferred"); // false + rs.getBoolean("deletedbyinference");// false + rs.getDouble("trust"); // 0.9 + rs.getString("inferenceprovenance"); // NULL + // rs.getDate("dateofcollection"); + // rs.getDate("dateofvalidation"); + // rs.getDate("releasestartdate"); + // rs.getDate("releaseenddate"); + // rs.getString("missionstatementurl"); + // rs.getBoolean("dataprovider"); + // rs.getBoolean("serviceprovider"); + // rs.getString("databaseaccesstype"); + // rs.getString("datauploadtype"); + // rs.getString("databaseaccessrestriction"); + // rs.getString("datauploadrestriction"); + // rs.getBoolean("versioning"); + // rs.getString("citationguidelineurl"); + // rs.getString("qualitymanagementkind"); + // rs.getString("pidsystems"); + // rs.getString("certificates"); + rs.getArray("policies"); + // rs.getString("collectedfromid"); + // rs.getString("collectedfromname"); + rs.getString("datasourcetype"); // COMPLEX XXX@@@@.... 
+ rs.getString("provenanceaction"); // 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' + // AS provenanceaction, + rs.getString("journal"); // CONCAT(d.issn, '@@@', d.eissn, '@@@', d.lissn) AS journal + + emitOaf(ds); + } catch (final Exception e) { + log.error("Error processing datasource row", e); // best-effort: report the failure and skip this row instead of dropping it silently + } + } + + public void processProject(final ResultSet rs) { + try { + + final DataInfo info = MigrationUtils.dataInfo(null, null, null, null, null, null); // TODO + + final Project p = new Project(); + + p.setId(MigrationUtils.createOpenaireId("40", rs.getString("projectid"))); + p.setOriginalId(Arrays.asList(rs.getString("projectid"))); + p.setCollectedfrom(MigrationUtils.listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); + p.setPid(null); // List // TODO + + p.setDateofcollection(rs.getDate("dateofcollection").toString()); + p.setDateoftransformation(rs.getDate("dateoftransformation").toString()); + p.setExtraInfo(null); // List //TODO + p.setOaiprovenance(null); // OAIProvenance /TODO + + p.setWebsiteurl(MigrationUtils.field(rs.getString("websiteurl"), info)); + p.setCode(MigrationUtils.field(rs.getString("code"), info)); + p.setAcronym(MigrationUtils.field(rs.getString("acronym"), info)); + p.setTitle(MigrationUtils.field(rs.getString("title"), info)); + p.setStartdate(MigrationUtils.field(rs.getDate("startdate").toString(), info)); + p.setEnddate(MigrationUtils.field(rs.getDate("enddate").toString(), info)); + p.setCallidentifier(MigrationUtils.field(rs.getString("callidentifier"), info)); + p.setKeywords(MigrationUtils.field(rs.getString("keywords"), info)); + p.setDuration(MigrationUtils.field(Integer.toString(rs.getInt("duration")), info)); + p.setEcsc39(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecsc39")), info)); + p.setOamandatepublications(MigrationUtils.field(Boolean.toString(rs.getBoolean("oamandatepublications")), info)); +
p.setEcarticle29_3(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecarticle29_3")), info)); + p.setSubjects(null); // List //TODO + p.setFundingtree(null); // List> //TODO + p.setContracttype(null); // Qualifier //TODO + p.setOptional1(MigrationUtils.field(rs.getString("optional1"), info)); + p.setOptional2(MigrationUtils.field(rs.getString("optional2"), info)); + p.setJsonextrainfo(MigrationUtils.field(rs.getString("jsonextrainfo"), info)); + p.setContactfullname(MigrationUtils.field(rs.getString("contactfullname"), info)); + p.setContactfax(MigrationUtils.field(rs.getString("contactfax"), info)); + p.setContactphone(MigrationUtils.field(rs.getString("contactphone"), info)); + p.setContactemail(MigrationUtils.field(rs.getString("contactemail"), info)); + p.setSummary(MigrationUtils.field(rs.getString("summary"), info)); + p.setCurrency(MigrationUtils.field(rs.getString("currency"), info)); + p.setTotalcost(new Float(rs.getDouble("totalcost"))); + p.setFundedamount(new Float(rs.getDouble("fundedamount"))); + + // rs.getString("projectid"); + // rs.getString("code"); + // rs.getString("websiteurl"); + // rs.getString("acronym"); + // rs.getString("title"); + // rs.getDate("startdate"); + // rs.getDate("enddate"); + // rs.getString("callidentifier"); + // rs.getString("keywords"); + // rs.getInt("duration"); + // rs.getBoolean("ecsc39"); + // rs.getBoolean("oamandatepublications"); + // rs.getBoolean("ecarticle29_3"); + // rs.getDate("dateofcollection"); + // rs.getDate("dateoftransformation"); + rs.getBoolean("inferred"); + rs.getBoolean("deletedbyinference"); + rs.getDouble("trust"); + rs.getString("inferenceprovenance"); + // rs.getString("optional1"); + // rs.getString("optional2"); + rs.getString("jsonextrainfo"); + // rs.getString("contactfullname"); + // rs.getString("contactfax"); + // rs.getString("contactphone"); + // rs.getString("contactemail"); + // rs.getString("summary"); + // rs.getString("currency"); + // rs.getDouble("totalcost"); + // 
rs.getDouble("fundedamount"); + // rs.getString("collectedfromid"); + // rs.getString("collectedfromname"); + rs.getString("contracttype"); // COMPLEX + rs.getString("provenanceaction"); // COMPLEX + rs.getArray("pid"); + rs.getArray("subjects"); + rs.getArray("fundingtree"); + + emitOaf(p); + + } catch (final Exception e) { + log.error("Error processing project row", e); // best-effort: report the failure and skip this row instead of dropping it silently + } + } + + public void processOrganization(final ResultSet rs) { + try { + // Maps one result-set row onto an Organization and emits it; string fields are read from the row by column name. + final DataInfo info = MigrationUtils.dataInfo(null, null, null, null, null, null); // TODO + + final Organization o = new Organization(); + + o.setId(MigrationUtils.createOpenaireId("20", rs.getString("organizationid"))); // String id) { + o.setOriginalId(Arrays.asList(rs.getString("organizationid"))); + o.setCollectedfrom(MigrationUtils.listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); + o.setPid(null); // List // TODO + o.setDateofcollection(rs.getDate("dateofcollection").toString()); + o.setDateoftransformation(rs.getDate("dateoftransformation").toString()); + o.setExtraInfo(null); // List // TODO + o.setOaiprovenance(null); // OAIProvenance // TODO + o.setLegalshortname(MigrationUtils.field(rs.getString("legalshortname"), info)); + o.setLegalname(MigrationUtils.field(rs.getString("legalname"), info)); + o.setAlternativeNames(null); // List> //TODO + o.setWebsiteurl(MigrationUtils.field(rs.getString("websiteurl"), info)); + o.setLogourl(MigrationUtils.field(rs.getString("logourl"), info)); + o.setEclegalbody(MigrationUtils.field(Boolean.toString(rs.getBoolean("eclegalbody")), info)); + o.setEclegalperson(MigrationUtils.field(Boolean.toString(rs.getBoolean("eclegalperson")), info)); + o.setEcnonprofit(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecnonprofit")), info)); + o.setEcresearchorganization(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecresearchorganization")), info)); + o.setEchighereducation(MigrationUtils.field(Boolean.toString(rs.getBoolean("echighereducation")), info)); + o.setEcinternationalorganizationeurinterests(MigrationUtils
+ .field(Boolean.toString(rs.getBoolean("ecinternationalorganizationeurinterests")), info)); + o.setEcinternationalorganization(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecinternationalorganization")), info)); + o.setEcenterprise(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecenterprise")), info)); + o.setEcsmevalidated(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecsmevalidated")), info)); + o.setEcnutscode(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecnutscode")), info)); + o.setCountry(null); // Qualifier country) { + + // rs.getString("organizationid"); + // rs.getString("legalshortname"); + // rs.getString("legalname"); + // rs.getString("websiteurl"); + // rs.getString("logourl"); + // rs.getBoolean("eclegalbody"); + // rs.getBoolean("eclegalperson"); + // rs.getBoolean("ecnonprofit"); + // rs.getBoolean("ecresearchorganization"); + // rs.getBoolean("echighereducation"); + // rs.getBoolean("ecinternationalorganizationeurinterests"); + // rs.getBoolean("ecinternationalorganization"); + // rs.getBoolean("ecenterprise"); + // rs.getBoolean("ecsmevalidated"); + // rs.getBoolean("ecnutscode"); + rs.getDate("dateofcollection"); + rs.getDate("dateoftransformation"); + rs.getBoolean("inferred"); + rs.getBoolean("deletedbyinference"); + rs.getDouble("trust"); + rs.getString("inferenceprovenance"); + // rs.getString("collectedfromid"); + // rs.getString("collectedfromname"); + rs.getString("country"); + rs.getString("provenanceaction"); + rs.getArray("pid"); + + emitOaf(o); + } catch (final Exception e) { + log.error("Error processing organization row", e); // best-effort: report the failure and skip this row instead of dropping it silently + } + } + + public void processDatasourceOrganization(final ResultSet rs) { + + try { + final Relation r = new Relation(); + + r.setRelType(null); // TODO + r.setSubRelType(null); // TODO + r.setRelClass(null); // TODO + r.setSource(null); // TODO + r.setTarget(null); // TODO + r.setCollectedFrom(MigrationUtils.listKeyValues("", "")); + + rs.getString("datasource"); + rs.getString("organization"); +
rs.getDate("startdate"); // NULL + rs.getDate("enddate"); // NULL + rs.getBoolean("inferred"); // false + rs.getBoolean("deletedbyinference"); // false + rs.getDouble("trust"); // 0.9 + rs.getString("inferenceprovenance"); // NULL + rs.getString("semantics"); // 'providedBy@@@provided + // by@@@dnet:datasources_organizations_typologies@@@dnet:datasources_organizations_typologies' AS + // semantics, + rs.getString("provenanceaction"); // d.provenanceaction || '@@@' || d.provenanceaction || + // '@@@dnet:provenanceActions@@@dnet:provenanceActions' AS provenanceaction + + emitOaf(r); + } catch (final Exception e) { + log.error("Error processing datasource-organization relation row", e); // best-effort: report the failure and skip this row instead of dropping it silently + } + } + + public void processProjectOrganization(final ResultSet rs) { + try { + final Relation r = new Relation(); + + r.setRelType(null); // TODO + r.setSubRelType(null); // TODO + r.setRelClass(null); // TODO + r.setSource(null); // TODO + r.setTarget(null); // TODO + r.setCollectedFrom(null); + + rs.getString("project"); + rs.getString("resporganization"); + rs.getInt("participantnumber"); + rs.getDouble("contribution"); + rs.getDate("startdate");// null + rs.getDate("enddate");// null + rs.getBoolean("inferred");// false + rs.getBoolean("deletedbyinference"); // false + rs.getDouble("trust"); + rs.getString("inferenceprovenance"); // NULL + rs.getString("semantics"); // po.semanticclass || '@@@' || po.semanticclass || + // '@@@dnet:project_organization_relations@@@dnet:project_organization_relations' AS semantics, + rs.getString("provenanceaction"); // 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' + // AS provenanceaction + emitOaf(r); + } catch (final Exception e) { + log.error("Error processing project-organization relation row", e); // best-effort: report the failure and skip this row instead of dropping it silently + } + } + + @Override + public void close() throws IOException { + super.close(); + dbClient.close(); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateMongoMdstoresApplication.java
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateMongoMdstoresApplication.java
new file mode 100644
index 000000000..cead2366b
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateMongoMdstoresApplication.java
@@ -0,0 +1,224 @@
+package eu.dnetlib.dhp.migration;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map.Entry;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.dom4j.Document;
+import org.dom4j.DocumentException;
+import org.dom4j.Node;
+import org.dom4j.io.SAXReader;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Dataset;
+import eu.dnetlib.dhp.schema.oaf.Oaf;
+import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
+import eu.dnetlib.dhp.schema.oaf.Publication;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.oaf.Result;
+import eu.dnetlib.dhp.schema.oaf.Software;
+
+/**
+ * Reads the metadata records stored in the MongoDB mdstores and emits the Oaf objects built from them.
+ *
+ * NOTE(review): the XML to Oaf mapping is still a skeleton, every mapped property is a TODO set to null.
+ */
+public class MigrateMongoMdstoresApplication extends AbstractMigrateApplication implements Closeable {
+
+    private static final Log log = LogFactory.getLog(MigrateMongoMdstoresApplication.class);
+
+    private final MdstoreClient mdstoreClient;
+
+    /**
+     * Parses the command line arguments and processes the records of the matching mdstores.
+     *
+     * @param args the arguments declared in migrate_mongo_mstores_parameters.json
+     * @throws Exception if the arguments cannot be parsed or the migration fails
+     */
+    public static void main(final String[] args) throws Exception {
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+                IOUtils.toString(MigrateMongoMdstoresApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json")));
+        parser.parseArgument(args);
+
+        final String mongoBaseUrl = parser.get("mongoBaseUrl");
+        final String mongoDb = parser.get("mongoDb");
+
+        final String mdFormat = parser.get("mdFormat");
+        final String mdLayout = parser.get("mdLayout");
+        final String mdInterpretation = parser.get("mdInterpretation");
+
+        final String hdfsPath = parser.get("hdfsPath");
+        final String hdfsNameNode = parser.get("namenode");
+        final String hdfsUser = parser.get("hdfsUser");
+
+        try (final MigrateMongoMdstoresApplication mig = new MigrateMongoMdstoresApplication(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb)) {
+            mig.processMdRecords(mdFormat, mdLayout, mdInterpretation);
+        }
+
+    }
+
+    public MigrateMongoMdstoresApplication(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl,
+            final String mongoDb) throws Exception {
+        super(hdfsPath, hdfsNameNode, hdfsUser);
+        this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb);
+
+    }
+
+    /**
+     * Emits the Oaf objects built from every record of the collections matching the given coordinates.
+     *
+     * @param mdFormat the metadata format
+     * @param mdLayout the metadata layout
+     * @param mdInterpretation the metadata interpretation
+     * @throws DocumentException if a record cannot be parsed as XML
+     */
+    public void processMdRecords(final String mdFormat, final String mdLayout, final String mdInterpretation) throws DocumentException {
+
+        for (final Entry<String, String> entry : mdstoreClient.validCollections(mdFormat, mdLayout, mdInterpretation).entrySet()) {
+            // final String mdId = entry.getKey();
+            final String currentColl = entry.getValue();
+
+            for (final String xml : mdstoreClient.listRecords(currentColl)) {
+                for (final Oaf oaf : createOafs(xml)) {
+                    emitOaf(oaf);
+                }
+            }
+        }
+    }
+
+    /**
+     * Maps a single XML record to the result it describes, followed by its relations.
+     *
+     * @param xml the XML record
+     * @return the mapped objects, empty when the record type is not supported
+     * @throws DocumentException if the record cannot be parsed as XML
+     */
+    private List<Oaf> createOafs(final String xml) throws DocumentException {
+        final SAXReader reader = new SAXReader();
+        final Document doc = reader.read(new StringReader(xml));
+
+        final String type = doc.valueOf(""); // TODO
+
+        final List<Oaf> oafs = new ArrayList<>();
+
+        // Locale.ROOT keeps the case folding independent from the default locale of the JVM
+        switch (type.toLowerCase(Locale.ROOT)) {
+        case "publication":
+            final Publication p = new Publication();
+            populateResultFields(p, doc);
+            p.setJournal(null); // TODO
+            oafs.add(p);
+            break;
+        case "dataset":
+            final Dataset d = new Dataset();
+            populateResultFields(d, doc);
+            d.setStoragedate(null); // TODO
+            d.setDevice(null); // TODO
+            d.setSize(null); // TODO
+            d.setVersion(null); // TODO
+            d.setLastmetadataupdate(null); // TODO
+            d.setMetadataversionnumber(null); // TODO
+            d.setGeolocation(null); // TODO
+            oafs.add(d);
+            break;
+        case "otherresearchproducts":
+            final OtherResearchProduct o = new OtherResearchProduct();
+            populateResultFields(o, doc);
+            o.setContactperson(null); // TODO
+            o.setContactgroup(null); // TODO
+            o.setTool(null); // TODO
+            oafs.add(o);
+            break;
+        case "software":
+            final Software s = new Software();
+            populateResultFields(s, doc);
+            s.setDocumentationUrl(null); // TODO
+            s.setLicense(null); // TODO
+            s.setCodeRepositoryUrl(null); // TODO
+            s.setProgrammingLanguage(null); // TODO
+            oafs.add(s);
+            break;
+        default:
+            log.error("Invalid type: " + type);
+            break;
+        }
+
+        if (!oafs.isEmpty()) {
+            addRelations(oafs, doc, "//*", "TYPE");
+            addRelations(oafs, doc, "//*", "TYPE");
+            addRelations(oafs, doc, "//*", "TYPE");
+        }
+
+        return oafs;
+    }
+
+    /** Appends to oafs one Relation, whose properties are still TODO placeholders, for each node selected by xpath. */
+    private void addRelations(final List<Oaf> oafs, final Document doc, final String xpath, final String type) {
+        for (final Object o : doc.selectNodes(xpath)) {
+            final Node n = (Node) o;
+            final Relation r = new Relation();
+            r.setRelType(null); // TODO
+            r.setSubRelType(null); // TODO
+            r.setRelClass(null); // TODO
+            r.setSource(null); // TODO
+            r.setTarget(null); // TODO
+            r.setCollectedFrom(null); // TODO
+            oafs.add(r);
+        }
+
+    }
+
+    /** Sets the properties shared by all the result types; every value is still a TODO placeholder. */
+    private void populateResultFields(final Result r, final Document doc) {
+        r.setDataInfo(null); // TODO
+        r.setLastupdatetimestamp(null); // TODO
+        r.setId(null); // TODO
+        r.setOriginalId(null); // TODO
+        r.setCollectedfrom(null); // TODO
+        r.setPid(null); // TODO
+        r.setDateofcollection(null); // TODO
+        r.setDateoftransformation(null); // TODO
+        r.setExtraInfo(null); // TODO
+        r.setOaiprovenance(null); // TODO
+        r.setAuthor(null); // TODO
+        r.setResulttype(null); // TODO
+        r.setLanguage(null); // TODO
+        r.setCountry(null); // TODO
+        r.setSubject(null); // TODO
+        r.setTitle(null); // TODO
+        r.setRelevantdate(null); // TODO
+        r.setDescription(null); // TODO
+        r.setDateofacceptance(null); // TODO
+        r.setPublisher(null); // TODO
+        r.setEmbargoenddate(null); // TODO
+        r.setSource(null); // TODO
+        r.setFulltext(null); // TODO
+        r.setFormat(null); // TODO
+        r.setContributor(null); // TODO
+        r.setResourcetype(null); // TODO
+        r.setCoverage(null); // TODO
+        r.setRefereed(null); // TODO
+        r.setContext(null); // TODO
+        r.setExternalReference(null); // TODO
+        r.setInstance(null); // TODO
+        r.setProcessingchargeamount(null); // TODO
+        r.setProcessingchargecurrency(null); // TODO
+    }
+
+    /** Closes the superclass first and then the mongo client, which is closed even when the first close fails. */
+    @Override
+    public void close() throws IOException {
+        try {
+            super.close();
+        } finally {
+            mdstoreClient.close();
+        }
+    }
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrationUtils.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrationUtils.java
new file mode 100644
index 000000000..8346a8041
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrationUtils.java
@@ -0,0 +1,191 @@
+package eu.dnetlib.dhp.migration;
+
+import java.sql.Array;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import org.apache.commons.lang3.StringUtils;
+
+import eu.dnetlib.dhp.schema.oaf.DataInfo;
+import eu.dnetlib.dhp.schema.oaf.ExtraInfo;
+import eu.dnetlib.dhp.schema.oaf.Field;
+import eu.dnetlib.dhp.schema.oaf.Journal;
+import eu.dnetlib.dhp.schema.oaf.KeyValue;
+import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
+import eu.dnetlib.dhp.schema.oaf.OriginDescription;
+import eu.dnetlib.dhp.schema.oaf.Qualifier;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import eu.dnetlib.dhp.utils.DHPUtils;
+
+/** Static factory helpers for the Oaf model objects (KeyValue, Field, Qualifier, StructuredProperty, ...). */
+public class MigrationUtils {
+
+    /** Creates a KeyValue with the given key and value. */
+    public static KeyValue keyValue(final String k, final String v) {
+        final KeyValue kv = new KeyValue();
+        kv.setKey(k);
+        kv.setValue(v);
+        return kv;
+    }
+
+    /**
+     * Creates the KeyValue list described by a flat sequence of alternating keys and values (k,v,k,v,...).
+     *
+     * @throws RuntimeException if the number of parameters is odd
+     */
+    public static List<KeyValue> listKeyValues(final String... s) {
+        if (s.length % 2 > 0) { throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)"); }
+
+        final List<KeyValue> list = new ArrayList<>();
+        for (int i = 0; i < s.length; i += 2) {
+            list.add(keyValue(s[i], s[i + 1]));
+        }
+        return list;
+    }
+
+    /** Creates a Field holding the given value and DataInfo. */
+    public static <T> Field<T> field(final T value, final DataInfo info) {
+        final Field<T> field = new Field<>();
+        field.setValue(value);
+        field.setDataInfo(info);
+        return field;
+    }
+
+    /** Creates one Field, with the given DataInfo, for each of the given values. */
+    public static List<Field<String>> listFields(final DataInfo info, final String... values) {
+        return Arrays.stream(values).map(v -> field(v, info)).collect(Collectors.toList());
+    }
+
+    /**
+     * Creates one Field, with the given DataInfo, for each element of the given SQL array.
+     *
+     * NOTE(review): the array content is cast to String[], so presumably only arrays of text are expected.
+     *
+     * @throws RuntimeException if the SQL array cannot be read
+     */
+    public static List<Field<String>> listFields(final DataInfo info, final Array array) {
+        try {
+            return listFields(info, (String[]) array.getArray());
+        } catch (final SQLException e) {
+            throw new RuntimeException("Invalid SQL array", e);
+        }
+    }
+
+    /** Creates a Qualifier with the given class and scheme identifiers and names. */
+    public static Qualifier qualifier(final String classid, final String classname, final String schemeid, final String schemename) {
+        final Qualifier q = new Qualifier();
+        q.setClassid(classid);
+        q.setClassname(classname);
+        q.setSchemeid(schemeid);
+        q.setSchemename(schemename);
+        return q;
+    }
+
+    /** Creates a StructuredProperty whose Qualifier is built from the given class and scheme properties. */
+    public static StructuredProperty structuredProperty(final String value,
+            final String classid,
+            final String classname,
+            final String schemeid,
+            final String schemename,
+            final DataInfo dataInfo) {
+        final StructuredProperty sp = new StructuredProperty();
+        sp.setValue(value);
+        sp.setQualifier(qualifier(classid, classname, schemeid, schemename));
+        sp.setDataInfo(dataInfo);
+        return sp;
+    }
+
+    /** Creates an ExtraInfo with the given properties. */
+    public static ExtraInfo extraInfo(final String name, final String value, final String typology, final String provenance, final String trust) {
+        final ExtraInfo info = new ExtraInfo();
+        info.setName(name);
+        info.setValue(value);
+        info.setTypology(typology);
+        info.setProvenance(provenance);
+        info.setTrust(trust);
+        return info;
+    }
+
+    /** Creates an OAIProvenance whose OriginDescription holds the given properties. */
+    public static OAIProvenance oaiIProvenance(final String identifier,
+            final String baseURL,
+            final String metadataNamespace,
+            final Boolean altered,
+            final String datestamp,
+            final String harvestDate) {
+
+        final OriginDescription desc = new OriginDescription();
+        desc.setIdentifier(identifier);
+        desc.setBaseURL(baseURL);
+        desc.setMetadataNamespace(metadataNamespace);
+        desc.setAltered(altered);
+        desc.setDatestamp(datestamp);
+        desc.setHarvestDate(harvestDate);
+
+        final OAIProvenance p = new OAIProvenance();
+        p.setOriginDescription(desc);
+
+        return p;
+    }
+
+    /** Creates a Journal with the given properties. */
+    public static Journal journal(final String name,
+            final String issnPrinted,
+            final String issnOnline,
+            final String issnLinking,
+            final String ep,
+            final String iss,
+            final String sp,
+            final String vol,
+            final String edition,
+            final String conferenceplace,
+            final String conferencedate,
+            final DataInfo dataInfo) {
+        final Journal j = new Journal();
+        j.setName(name);
+        j.setIssnPrinted(issnPrinted);
+        j.setIssnOnline(issnOnline);
+        j.setIssnLinking(issnLinking);
+        j.setEp(ep);
+        j.setIss(iss);
+        j.setSp(sp);
+        j.setVol(vol);
+        j.setEdition(edition);
+        j.setConferenceplace(conferenceplace);
+        j.setConferencedate(conferencedate);
+        j.setDataInfo(dataInfo);
+        return j;
+    }
+
+    /** Creates a DataInfo with the given properties. */
+    public static DataInfo dataInfo(final Boolean deletedbyinference,
+            final String inferenceprovenance,
+            final Boolean inferred,
+            final Boolean invisible,
+            final Qualifier provenanceaction,
+            final String trust) {
+        final DataInfo d = new DataInfo();
+        d.setDeletedbyinference(deletedbyinference);
+        d.setInferenceprovenance(inferenceprovenance);
+        d.setInferred(inferred);
+        d.setInvisible(invisible);
+        d.setProvenanceaction(provenanceaction);
+        d.setTrust(trust);
+        return d;
+    }
+
+    /**
+     * Builds an identifier of the form prefix|nsPrefix::hash, where nsPrefix is the part of originalId before its
+     * first "::" separator and hash is DHPUtils.md5 applied to the part after it.
+     */
+    public static String createOpenaireId(final String prefix, final String originalId) {
+        final String nsPrefix = StringUtils.substringBefore(originalId, "::");
+        final String rest = StringUtils.substringAfter(originalId, "::");
+        return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest));
+    }
+
+}
+
diff --git
a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json new file mode 100644 index 000000000..861d297ba --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json @@ -0,0 +1,38 @@ +[ + { + "paramName": "p", + "paramLongName": "hdfsPath", + "paramDescription": "the path where the sequence file is stored", + "paramRequired": true + }, + { + "paramName": "n", + "paramLongName": "namenode", + "paramDescription": "the Name Node URI", + "paramRequired": true + }, + { + "paramName": "u", + "paramLongName": "hdfsUser", + "paramDescription": "the user who creates the HDFS sequence file", + "paramRequired": true + }, + { + "paramName": "dburl", + "paramLongName": "postgresUrl", + "paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb", + "paramRequired": true + }, + { + "paramName": "dbuser", + "paramLongName": "postgresUser", + "paramDescription": "postgres user", + "paramRequired": true + }, + { + "paramName": "dbpasswd", + "paramLongName": "postgresPassword", + "paramDescription": "postgres password", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json new file mode 100644 index 000000000..fb5736dc0 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json @@ -0,0 +1,50 @@ +[ + { + "paramName": "p", + "paramLongName": "hdfsPath", + "paramDescription": "the path where the sequence file is stored", + "paramRequired": true + }, + { + "paramName": "n", + "paramLongName": "namenode", + "paramDescription":
"the Name Node URI", + "paramRequired": true + }, + { + "paramName": "u", + "paramLongName": "hdfsUser", + "paramDescription": "the user who creates the HDFS sequence file", + "paramRequired": true + }, + { + "paramName": "mongourl", + "paramLongName": "mongoBaseUrl", + "paramDescription": "mongoDB url, example: mongodb://[username:password@]host[:port]", + "paramRequired": true + }, + { + "paramName": "db", + "paramLongName": "mongoDb", + "paramDescription": "mongo database", + "paramRequired": true + }, + { + "paramName": "f", + "paramLongName": "mdFormat", + "paramDescription": "metadata format", + "paramRequired": true + }, + { + "paramName": "l", + "paramLongName": "mdLayout", + "paramDescription": "metadata layout", + "paramRequired": true + }, + { + "paramName": "i", + "paramLongName": "mdInterpretation", + "paramDescription": "metadata interpretation", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasourceOrganization.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasourceOrganization.sql new file mode 100644 index 000000000..885b6ae09 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasourceOrganization.sql @@ -0,0 +1,16 @@ +SELECT + dor.datasource AS datasource, + dor.organization AS organization, + NULL AS startdate, + NULL AS enddate, + false AS inferred, + false AS deletedbyinference, + 0.9 AS trust, + NULL AS inferenceprovenance, + + 'providedBy@@@provided by@@@dnet:datasources_organizations_typologies@@@dnet:datasources_organizations_typologies' AS semantics, + d.provenanceaction || '@@@' || d.provenanceaction || '@@@dnet:provenanceActions@@@dnet:provenanceActions' AS provenanceaction + +FROM dsm_datasource_organization dor + LEFT OUTER JOIN dsm_datasources d ON (dor.datasource = d.id) + diff --git
a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasources.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasources.sql new file mode 100644 index 000000000..8c587f34e --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasources.sql @@ -0,0 +1,147 @@ +SELECT + d.id AS datasourceid, + d.id || array_agg(distinct di.pid) AS identities, + d.officialname AS officialname, + d.englishname AS englishname, + d.contactemail AS contactemail, + CASE + WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility):: TEXT) @> ARRAY ['openaire-cris_1.1']) + THEN + 'openaire-cris_1.1@@@OpenAIRE CRIS v1.1@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility):: TEXT) @> ARRAY ['driver', 'openaire2.0']) + THEN + 'driver-openaire2.0@@@OpenAIRE 2.0+ (DRIVER OA, EC funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['driver']) + THEN + 'driver@@@OpenAIRE Basic (DRIVER OA)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire2.0']) + THEN + 'openaire2.0@@@OpenAIRE 2.0 (EC funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire3.0']) + THEN + 'openaire3.0@@@OpenAIRE 3.0 (OA, funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire2.0_data']) + THEN + 'openaire2.0_data@@@OpenAIRE Data (funded, referenced 
datasets)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['native']) + THEN + 'native@@@proprietary@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['hostedBy']) + THEN + 'hostedBy@@@collected from a compatible aggregator@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['notCompatible']) + THEN + 'notCompatible@@@under validation@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + ELSE + 'UNKNOWN@@@not available@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + END AS openairecompatibility, + d.websiteurl AS websiteurl, + d.logourl AS logourl, + array_agg(DISTINCT CASE WHEN a.protocol = 'oai' and last_aggregation_date is not null THEN a.baseurl ELSE NULL END) AS accessinfopackage, + d.latitude AS latitude, + d.longitude AS longitude, + d.namespaceprefix AS namespaceprefix, + NULL AS odnumberofitems, + NULL AS odnumberofitemsdate, + + (SELECT array_agg(s|| '###keywords@@@keywords@@@dnet:subject_classification_typologies@@@dnet:subject_classification_typologies') + FROM UNNEST( + ARRAY( + SELECT trim(s) + FROM unnest(string_to_array(d.subjects, '@@')) AS s)) AS s) AS subjects, + + d.description AS description, + NULL AS odpolicies, + ARRAY(SELECT trim(s) + FROM unnest(string_to_array(d.languages, ',')) AS s) AS odlanguages, + ARRAY(SELECT trim(s) + FROM unnest(string_to_array(d.od_contenttypes, '-')) AS s) AS odcontenttypes, + false AS inferred, + false AS deletedbyinference, + 0.9 AS trust, + NULL AS inferenceprovenance, + d.dateofcollection AS dateofcollection, + d.dateofvalidation AS dateofvalidation, + -- re3data fields + d.releasestartdate AS 
releasestartdate, + d.releaseenddate AS releaseenddate, + d.missionstatementurl AS missionstatementurl, + d.dataprovider AS dataprovider, + d.serviceprovider AS serviceprovider, + d.databaseaccesstype AS databaseaccesstype, + d.datauploadtype AS datauploadtype, + d.databaseaccessrestriction AS databaseaccessrestriction, + d.datauploadrestriction AS datauploadrestriction, + d.versioning AS versioning, + d.citationguidelineurl AS citationguidelineurl, + d.qualitymanagementkind AS qualitymanagementkind, + d.pidsystems AS pidsystems, + d.certificates AS certificates, + ARRAY[]::text[] AS policies, + dc.id AS collectedfromid, + dc.officialname AS collectedfromname, + d.typology || '@@@' || CASE + WHEN (d.typology = 'crissystem') THEN 'CRIS System' + WHEN (d.typology = 'datarepository::unknown') THEN 'Data Repository' + WHEN (d.typology = 'aggregator::datarepository') THEN 'Data Repository Aggregator' + WHEN (d.typology = 'infospace') THEN 'Information Space' + WHEN (d.typology = 'pubsrepository::institutional') THEN 'Institutional Repository' + WHEN (d.typology = 'aggregator::pubsrepository::institutional') THEN 'Institutional Repository Aggregator' + WHEN (d.typology = 'pubsrepository::journal') THEN 'Journal' + WHEN (d.typology = 'aggregator::pubsrepository::journals') THEN 'Journal Aggregator/Publisher' + WHEN (d.typology = 'pubsrepository::mock') THEN 'Other' + WHEN (d.typology = 'pubscatalogue::unknown') THEN 'Publication Catalogue' + WHEN (d.typology = 'pubsrepository::unknown') THEN 'Publication Repository' + WHEN (d.typology = 'aggregator::pubsrepository::unknown') THEN 'Publication Repository Aggregator' + WHEN (d.typology = 'entityregistry') THEN 'Registry' + WHEN (d.typology = 'scholarcomminfra') THEN 'Scholarly Comm. 
Infrastructure' + WHEN (d.typology = 'pubsrepository::thematic') THEN 'Thematic Repository' + WHEN (d.typology = 'websource') THEN 'Web Source' + WHEN (d.typology = 'entityregistry::projects') THEN 'Funder database' + WHEN (d.typology = 'entityregistry::repositories') THEN 'Registry of repositories' + WHEN (d.typology = 'softwarerepository') THEN 'Software Repository' + WHEN (d.typology = 'aggregator::softwarerepository') THEN 'Software Repository Aggregator' + WHEN (d.typology = 'orprepository') THEN 'Repository' + ELSE 'Other' + END || '@@@dnet:datasource_typologies@@@dnet:datasource_typologies' AS datasourcetype, + 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction, + CONCAT(d.issn, '@@@', d.eissn, '@@@', d.lissn) AS journal + +FROM dsm_datasources d + +LEFT OUTER JOIN dsm_datasources dc on (d.collectedfrom = dc.id) +LEFT OUTER JOIN dsm_api a ON (d.id = a.datasource) +LEFT OUTER JOIN dsm_datasourcepids di ON (d.id = di.datasource) + +GROUP BY + d.id, + d.officialname, + d.englishname, + d.websiteurl, + d.logourl, + d.contactemail, + d.namespaceprefix, + d.description, + d.latitude, + d.longitude, + d.dateofcollection, + d.dateofvalidation, + d.releasestartdate, + d.releaseenddate, + d.missionstatementurl, + d.dataprovider, + d.serviceprovider, + d.databaseaccesstype, + d.datauploadtype, + d.databaseaccessrestriction, + d.datauploadrestriction, + d.versioning, + d.citationguidelineurl, + d.qualitymanagementkind, + d.pidsystems, + d.certificates, + dc.id, + dc.officialname, + d.issn, + d.eissn, + d.lissn diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryOrganizations.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryOrganizations.sql new file mode 100644 index 000000000..682ca3596 --- /dev/null +++ 
b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryOrganizations.sql @@ -0,0 +1,36 @@ +SELECT + o.id AS organizationid, + o.legalshortname AS legalshortname, + o.legalname AS legalname, + o.websiteurl AS websiteurl, + o.logourl AS logourl, + o.ec_legalbody AS eclegalbody, + o.ec_legalperson AS eclegalperson, + o.ec_nonprofit AS ecnonprofit, + o.ec_researchorganization AS ecresearchorganization, + o.ec_highereducation AS echighereducation, + o.ec_internationalorganizationeurinterests AS ecinternationalorganizationeurinterests, + o.ec_internationalorganization AS ecinternationalorganization, + o.ec_enterprise AS ecenterprise, + o.ec_smevalidated AS ecsmevalidated, + o.ec_nutscode AS ecnutscode, + o.dateofcollection AS dateofcollection, + o.lastupdate AS dateoftransformation, + false AS inferred, + false AS deletedbyinference, + o.trust AS trust, + '' AS inferenceprovenance, + d.id AS collectedfromid, + d.officialname AS collectedfromname, + + o.country || '@@@dnet:countries' AS country, + 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction, + + ARRAY[]::text[] AS pid +FROM dsm_organizations o + LEFT OUTER JOIN dsm_datasources d ON (d.id = o.collectedfrom) + + + + + diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryOrganizationsFromOpenOrgsDB.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryOrganizationsFromOpenOrgsDB.sql new file mode 100644 index 000000000..dc9550883 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryOrganizationsFromOpenOrgsDB.sql @@ -0,0 +1,53 @@ +SELECT + o.id AS organizationid, + coalesce((array_agg(a.acronym))[1], o.name) AS legalshortname, + o.name AS legalname, + array_agg(DISTINCT n.name) AS "alternativeNames", + (array_agg(u.url))[1] AS websiteurl, + o.modification_date AS 
dateoftransformation, + false AS inferred, + false AS deletedbyinference, + 0.95 AS trust, + '' AS inferenceprovenance, + 'openaire____::openorgs' AS collectedfromid, + 'OpenOrgs Database' AS collectedfromname, + o.country || '@@@dnet:countries' AS country, + 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction, + array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types') AS pid +FROM organizations o + LEFT OUTER JOIN acronyms a ON (a.id = o.id) + LEFT OUTER JOIN urls u ON (u.id = o.id) + LEFT OUTER JOIN other_ids i ON (i.id = o.id) + LEFT OUTER JOIN other_names n ON (n.id = o.id) +GROUP BY + o.id, + o.name, + o.modification_date, + o.country + +UNION ALL + +SELECT + 'openorgsmesh'||substring(o.id, 13)||'-'||md5(n.name) AS organizationid, + n.name AS legalshortname, + n.name AS legalname, + ARRAY[]::text[] AS "alternativeNames", + (array_agg(u.url))[1] AS websiteurl, + o.modification_date AS dateoftransformation, + false AS inferred, + false AS deletedbyinference, + 0.88 AS trust, + '' AS inferenceprovenance, + 'openaire____::openorgs' AS collectedfromid, + 'OpenOrgs Database' AS collectedfromname, + o.country || '@@@dnet:countries' AS country, + 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction, + array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types') AS pid +FROM other_names n + LEFT OUTER JOIN organizations o ON (n.id = o.id) + LEFT OUTER JOIN urls u ON (u.id = o.id) + LEFT OUTER JOIN other_ids i ON (i.id = o.id) +GROUP BY + o.id, o.modification_date, o.country, n.name + + diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjectOrganization.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjectOrganization.sql new file mode 100644 index 000000000..4483d6145 
--- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjectOrganization.sql @@ -0,0 +1,16 @@ +SELECT + po.project AS project, + po.resporganization AS resporganization, + po.participantnumber AS participantnumber, + po.contribution AS contribution, + NULL AS startdate, + NULL AS enddate, + false AS inferred, + false AS deletedbyinference, + po.trust AS trust, + NULL AS inferenceprovenance, + + po.semanticclass || '@@@' || po.semanticclass || '@@@dnet:project_organization_relations@@@dnet:project_organization_relations' AS semantics, + 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction + +FROM project_organization po diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects.sql new file mode 100644 index 000000000..f04f1f03b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects.sql @@ -0,0 +1,87 @@ +SELECT + p.id AS projectid, + p.code AS code, + p.websiteurl AS websiteurl, + p.acronym AS acronym, + p.title AS title, + p.startdate AS startdate, + p.enddate AS enddate, + p.call_identifier AS callidentifier, + p.keywords AS keywords, + p.duration AS duration, + p.ec_sc39 AS ecsc39, + p.oa_mandate_for_publications AS oamandatepublications, + p.ec_article29_3 AS ecarticle29_3, + p.dateofcollection AS dateofcollection, + p.lastupdate AS dateoftransformation, + p.inferred AS inferred, + p.deletedbyinference AS deletedbyinference, + p.trust AS trust, + p.inferenceprovenance AS inferenceprovenance, + p.optional1 AS optional1, + p.optional2 AS optional2, + p.jsonextrainfo AS jsonextrainfo, + p.contactfullname AS contactfullname, + p.contactfax AS contactfax, + p.contactphone AS contactphone, + p.contactemail AS contactemail, + 
p.summary AS summary, + p.currency AS currency, + p.totalcost AS totalcost, + p.fundedamount AS fundedamount, + dc.id AS collectedfromid, + dc.officialname AS collectedfromname, + p.contracttype || '@@@' || p.contracttypename || '@@@' || p.contracttypescheme || '@@@' || p.contracttypescheme AS contracttype, + pac.code || '@@@' || pac.name || '@@@' || pas.code || '@@@' || pas.name AS provenanceaction, + array_agg(DISTINCT i.pid || '###' || i.issuertype) AS pid, + array_agg(DISTINCT s.name || '###' || sc.code || '@@@' || sc.name || '@@@' || ss.code || '@@@' || ss.name) AS subjects, + array_agg(DISTINCT fp.path) AS fundingtree + FROM projects p + LEFT OUTER JOIN class pac ON (pac.code = p.provenanceactionclass) + LEFT OUTER JOIN scheme pas ON (pas.code = p.provenanceactionscheme) + + LEFT OUTER JOIN projectpids pp ON (pp.project = p.id) + LEFT OUTER JOIN dsm_identities i ON (i.pid = pp.pid) + + LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom) + + LEFT OUTER JOIN project_fundingpath pf ON (pf.project = p.id) + LEFT OUTER JOIN fundingpaths fp ON (fp.id = pf.funding) + + LEFT OUTER JOIN project_subject ps ON (ps.project = p.id) + LEFT OUTER JOIN subjects s ON (s.id = ps.subject) + + LEFT OUTER JOIN class sc ON (sc.code = s.semanticclass) + LEFT OUTER JOIN scheme ss ON (ss.code = s.semanticscheme) + + GROUP BY + p.id, + p.code, + p.websiteurl, + p.acronym, + p.title, + p.startdate, + p.enddate, + p.call_identifier, + p.keywords, + p.duration, + p.ec_sc39, + p.oa_mandate_for_publications, + p.ec_article29_3, + p.dateofcollection, + p.inferred, + p.deletedbyinference, + p.trust, + p.inferenceprovenance, + p.contactfullname, + p.contactfax, + p.contactphone, + p.contactemail, + p.contracttype, + p.summary, + p.currency, + p.totalcost, + p.fundedamount, + dc.id, + dc.officialname, + pac.code, pac.name, pas.code, pas.name; \ No newline at end of file diff --git 
a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/querySimilarityFromOpenOrgsDB.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/querySimilarityFromOpenOrgsDB.sql new file mode 100644 index 000000000..4407559c6 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/querySimilarityFromOpenOrgsDB.sql @@ -0,0 +1,17 @@ +SELECT local_id AS id1, oa_original_id AS id2 FROM openaire_simrels WHERE reltype = 'is_similar' + +UNION ALL + +SELECT + o.id AS id1, + 'openorgsmesh'||substring(o.id, 13)||'-'||md5(a.acronym) AS id2 +FROM acronyms a + LEFT OUTER JOIN organizations o ON (a.id = o.id) + +UNION ALL + +SELECT + o.id AS id1, + 'openorgsmesh'||substring(o.id, 13)||'-'||md5(n.name) AS id2 +FROM other_names n + LEFT OUTER JOIN organizations o ON (n.id = o.id) From fa7504bf29b43c5ca291e1b0044fe15ecb27cd42 Mon Sep 17 00:00:00 2001 From: "sandro.labruzzo" Date: Mon, 20 Jan 2020 10:28:00 +0100 Subject: [PATCH 07/45] removed DLI stuff should be in a branch --- .../eu/dnetlib/dhp/schema/dli/Entity.java | 118 ------------------ .../java/eu/dnetlib/dhp/schema/dli/Pid.java | 33 ----- .../eu/dnetlib/dhp/schema/dli/Provenance.java | 35 ------ .../eu/dnetlib/dhp/schema/dli/Relation.java | 47 ------- .../dhp/schema/dli/RelationSemantic.java | 16 --- .../eu/dnetlib/dhp/schema/dli/Subject.java | 35 ------ 6 files changed, 284 deletions(-) delete mode 100644 dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Entity.java delete mode 100644 dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Pid.java delete mode 100644 dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Provenance.java delete mode 100644 dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Relation.java delete mode 100644 dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/RelationSemantic.java delete mode 100644 dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Subject.java diff --git 
a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Entity.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Entity.java deleted file mode 100644 index 894d54eaf..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Entity.java +++ /dev/null @@ -1,118 +0,0 @@ -package eu.dnetlib.dhp.schema.dli; - -import java.io.Serializable; -import java.util.List; - -public class Entity implements Serializable { - - private String identifier; - - private List pid; - - private List title; - - private List date; - - private String typology; - - private List authors; - - private List subject; - - private String description; - - private String completionStatus; - - private List collectedFrom; - - private List publisher; - - - public String getIdentifier() { - return identifier; - } - - public void setIdentifier(String identifier) { - this.identifier = identifier; - } - - public List getPid() { - return pid; - } - - public void setPid(List pid) { - this.pid = pid; - } - - public List getTitle() { - return title; - } - - public void setTitle(List title) { - this.title = title; - } - - public List getDate() { - return date; - } - - public void setDate(List date) { - this.date = date; - } - - public String getTypology() { - return typology; - } - - public void setTypology(String typology) { - this.typology = typology; - } - - public List getAuthors() { - return authors; - } - - public void setAuthors(List authors) { - this.authors = authors; - } - - public List getSubject() { - return subject; - } - - public void setSubject(List subject) { - this.subject = subject; - } - - public String getDescription() { - return description; - } - - public void setDescription(String description) { - this.description = description; - } - - public List getCollectedFrom() { - return collectedFrom; - } - - public void setCollectedFrom(List collectedFrom) { - this.collectedFrom = collectedFrom; - } - - public List getPublisher() { - return publisher; - } - - public void 
setPublisher(List publisher) { - this.publisher = publisher; - } - - public String getCompletionStatus() { - return completionStatus; - } - - public void setCompletionStatus(String completionStatus) { - this.completionStatus = completionStatus; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Pid.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Pid.java deleted file mode 100644 index 252245f45..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Pid.java +++ /dev/null @@ -1,33 +0,0 @@ -package eu.dnetlib.dhp.schema.dli; - -import eu.dnetlib.dhp.utils.DHPUtils; -import org.apache.commons.lang3.StringUtils; - -public class Pid { - - private String pid; - - private String pidType; - - public String getPid() { - return pid; - } - - public void setPid(String pid) { - this.pid = pid; - } - - public String getPidType() { - return pidType; - } - - public void setPidType(String pidType) { - this.pidType = pidType; - } - - public String generateId() { - if(StringUtils.isEmpty(pid) || StringUtils.isEmpty(pidType)) - return null; - return DHPUtils.md5(String.format("%s::%s", pid, pidType)); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Provenance.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Provenance.java deleted file mode 100644 index 300b1134b..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Provenance.java +++ /dev/null @@ -1,35 +0,0 @@ -package eu.dnetlib.dhp.schema.dli; - -public class Provenance { - - private String datasourceId; - - private String datasourceName; - - private String completionStatus; - - - public String getDatasourceId() { - return datasourceId; - } - - public void setDatasourceId(String datasourceId) { - this.datasourceId = datasourceId; - } - - public String getDatasourceName() { - return datasourceName; - } - - public void setDatasourceName(String datasourceName) { - this.datasourceName = datasourceName; - } - - public String 
getCompletionStatus() { - return completionStatus; - } - - public void setCompletionStatus(String completionStatus) { - this.completionStatus = completionStatus; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Relation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Relation.java deleted file mode 100644 index b83cccb73..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Relation.java +++ /dev/null @@ -1,47 +0,0 @@ -package eu.dnetlib.dhp.schema.dli; - -import java.io.Serializable; -import java.util.List; - -public class Relation implements Serializable { - - private String source; - - private String target; - - private List provenance; - - private RelationSemantic semantic; - - public String getSource() { - return source; - } - - public void setSource(String source) { - this.source = source; - } - - public String getTarget() { - return target; - } - - public void setTarget(String target) { - this.target = target; - } - - public List getProvenance() { - return provenance; - } - - public void setProvenance(List provenance) { - this.provenance = provenance; - } - - public RelationSemantic getSemantic() { - return semantic; - } - - public void setSemantic(RelationSemantic semantic) { - this.semantic = semantic; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/RelationSemantic.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/RelationSemantic.java deleted file mode 100644 index ff871ef2d..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/RelationSemantic.java +++ /dev/null @@ -1,16 +0,0 @@ -package eu.dnetlib.dhp.schema.dli; - -import java.io.Serializable; - -public class RelationSemantic extends Subject implements Serializable { - - public String inverse; - - public String getInverse() { - return inverse; - } - - public void setInverse(String inverse) { - this.inverse = inverse; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Subject.java 
b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Subject.java deleted file mode 100644 index bd89bc6dd..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Subject.java +++ /dev/null @@ -1,35 +0,0 @@ -package eu.dnetlib.dhp.schema.dli; - -import java.io.Serializable; - -public class Subject implements Serializable { - - private String schema; - - private String value; - - public Subject() { - - } - - public Subject(String schema, String value) { - this.schema = schema; - this.value = value; - } - - public String getSchema() { - return schema; - } - - public void setSchema(String schema) { - this.schema = schema; - } - - public String getValue() { - return value; - } - - public void setValue(String value) { - this.value = value; - } -} From b35c59eb42465d6e7dd9981e7f745d4a7eeb5dcb Mon Sep 17 00:00:00 2001 From: Michele Artini Date: Mon, 20 Jan 2020 16:04:19 +0100 Subject: [PATCH 08/45] partial implementation of entities from db --- .../dhp-build-properties-maven-plugin/pom.xml | 35 + .../eu/dnetlib/dhp/schema/dli/Relation.java | 61 +- .../MigrateDbEntitiesApplication.java | 255 ++++-- dhp-workflows/dhp-dedup/pom.xml | 31 + .../dnetlib/dhp/graph/GraphMappingUtils.java | 36 +- .../dhp/graph/SparkGraphImporterJob.java | 63 +- pom.xml | 759 +++++++++--------- 7 files changed, 702 insertions(+), 538 deletions(-) diff --git a/dhp-build/dhp-build-properties-maven-plugin/pom.xml b/dhp-build/dhp-build-properties-maven-plugin/pom.xml index 4f99d5298..7b50acd3d 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/pom.xml +++ b/dhp-build/dhp-build-properties-maven-plugin/pom.xml @@ -76,6 +76,41 @@ + + + + + org.eclipse.m2e + lifecycle-mapping + 1.0.0 + + + + + + + org.apache.maven.plugins + + + maven-plugin-plugin + + + [3.2,) + + + descriptor + + + + + + + + + + + + diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Relation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Relation.java index b83cccb73..66007e21d 100644 --- 
a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Relation.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Relation.java @@ -5,43 +5,48 @@ import java.util.List; public class Relation implements Serializable { - private String source; + /** + * + */ + private static final long serialVersionUID = -9103706796710618813L; - private String target; + private String source; - private List provenance; + private String target; - private RelationSemantic semantic; + private List provenance; - public String getSource() { - return source; - } + private RelationSemantic semantic; - public void setSource(String source) { - this.source = source; - } + public String getSource() { + return source; + } - public String getTarget() { - return target; - } + public void setSource(final String source) { + this.source = source; + } - public void setTarget(String target) { - this.target = target; - } + public String getTarget() { + return target; + } - public List getProvenance() { - return provenance; - } + public void setTarget(final String target) { + this.target = target; + } - public void setProvenance(List provenance) { - this.provenance = provenance; - } + public List getProvenance() { + return provenance; + } - public RelationSemantic getSemantic() { - return semantic; - } + public void setProvenance(final List provenance) { + this.provenance = provenance; + } - public void setSemantic(RelationSemantic semantic) { - this.semantic = semantic; - } + public RelationSemantic getSemantic() { + return semantic; + } + + public void setSemantic(final RelationSemantic semantic) { + this.semantic = semantic; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java index 60a7c24f7..efc395812 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java 
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java @@ -2,11 +2,17 @@ package eu.dnetlib.dhp.migration; import java.io.Closeable; import java.io.IOException; +import java.sql.Array; import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.ArrayList; import java.util.Arrays; +import java.util.Date; +import java.util.List; import java.util.function.Consumer; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -15,14 +21,21 @@ import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Datasource; import eu.dnetlib.dhp.schema.oaf.Organization; import eu.dnetlib.dhp.schema.oaf.Project; +import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; public class MigrateDbEntitiesApplication extends AbstractMigrateApplication implements Closeable { + private static final Qualifier ENTITYREGISTRY_PROVENANCE_ACTION = MigrationUtils + .qualifier("sysimport:crosswalk:entityregistry", "sysimport:crosswalk:entityregistry", "dnet:provenance_actions", "dnet:provenance_actions"); + private static final Log log = LogFactory.getLog(MigrateDbEntitiesApplication.class); private final DbClient dbClient; + private final long lastUpdateTimestamp; + public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils.toString(MigrateDbEntitiesApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json"))); @@ -51,6 +64,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp final String dbPassword) throws Exception { super(hdfsPath, hdfsNameNode, hdfsUser); this.dbClient = new DbClient(dbUrl, dbUser, dbPassword); + this.lastUpdateTimestamp = new 
Date().getTime(); } public void execute(final String sqlFile, final Consumer consumer) throws Exception { @@ -61,7 +75,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp public void processDatasource(final ResultSet rs) { try { - final DataInfo info = MigrationUtils.dataInfo(null, null, null, null, null, null); // TODO + final DataInfo info = prepareDataInfo(rs); final Datasource ds = new Datasource(); @@ -74,8 +88,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp ds.setExtraInfo(null); // TODO ds.setOaiprovenance(null); // TODO - ds.setDatasourcetype(null); // Qualifier datasourcetype) { - ds.setOpenairecompatibility(null); // Qualifier openairecompatibility) { + ds.setDatasourcetype(prepareQualifierSplitting(rs.getString("datasourcetype"))); + ds.setOpenairecompatibility(prepareQualifierSplitting(rs.getString("openairecompatibility"))); ds.setOfficialname(MigrationUtils.field(rs.getString("officialname"), info)); ds.setEnglishname(MigrationUtils.field(rs.getString("englishname"), info)); ds.setWebsiteurl(MigrationUtils.field(rs.getString("websiteurl"), info)); @@ -86,7 +100,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp ds.setLongitude(MigrationUtils.field(Double.toString(rs.getDouble("longitude")), info)); ds.setDateofvalidation(MigrationUtils.field(rs.getDate("dateofvalidation").toString(), info)); ds.setDescription(MigrationUtils.field(rs.getString("description"), info)); - ds.setSubjects(null); // List subjects) { + ds.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info)); ds.setOdnumberofitems(MigrationUtils.field(Double.toString(rs.getInt("odnumberofitems")), info)); ds.setOdnumberofitemsdate(MigrationUtils.field(rs.getDate("odnumberofitemsdate").toString(), info)); ds.setOdpolicies(MigrationUtils.field(rs.getString("odpolicies"), info)); @@ -110,12 +124,15 @@ public class MigrateDbEntitiesApplication extends 
AbstractMigrateApplication imp ds.setPolicies(null); // List // TODO ds.setJournal(null); // Journal // TODO + ds.setDataInfo(info); + ds.setLastupdatetimestamp(lastUpdateTimestamp); + // rs.getString("datasourceid"); rs.getArray("identities"); // rs.getString("officialname"); // rs.getString("englishname"); // rs.getString("contactemail"); - rs.getString("openairecompatibility"); // COMPLEX ...@@@... + // rs.getString("openairecompatibility"); // COMPLEX ...@@@... // rs.getString("websiteurl"); // rs.getString("logourl"); // rs.getArray("accessinfopackage"); @@ -124,15 +141,15 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp // rs.getString("namespaceprefix"); // rs.getInt("odnumberofitems"); // NULL // rs.getDate("odnumberofitemsdate"); // NULL - rs.getArray("subjects"); + // rs.getArray("subjects"); // rs.getString("description"); // rs.getString("odpolicies"); // NULL // rs.getArray("odlanguages"); // rs.getArray("odcontenttypes"); - rs.getBoolean("inferred"); // false - rs.getBoolean("deletedbyinference");// false - rs.getDouble("trust"); // 0.9 - rs.getString("inferenceprovenance"); // NULL + // rs.getBoolean("inferred"); // false + // rs.getBoolean("deletedbyinference");// false + // rs.getDouble("trust"); // 0.9 + // rs.getString("inferenceprovenance"); // NULL // rs.getDate("dateofcollection"); // rs.getDate("dateofvalidation"); // rs.getDate("releasestartdate"); @@ -152,21 +169,22 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp rs.getArray("policies"); // rs.getString("collectedfromid"); // rs.getString("collectedfromname"); - rs.getString("datasourcetype"); // COMPLEX XXX@@@@.... - rs.getString("provenanceaction"); // 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' - // AS provenanceaction, + // rs.getString("datasourcetype"); // COMPLEX XXX@@@@.... 
+ // rs.getString("provenanceaction"); // + // 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' + // AS provenanceaction, rs.getString("journal"); // CONCAT(d.issn, '@@@', d.eissn, '@@@', d.lissn) AS journal emitOaf(ds); } catch (final Exception e) { - // TODO: handle exception + throw new RuntimeException(e); } } public void processProject(final ResultSet rs) { try { - final DataInfo info = MigrationUtils.dataInfo(null, null, null, null, null, null); // TODO + final DataInfo info = prepareDataInfo(rs); final Project p = new Project(); @@ -192,9 +210,9 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp p.setEcsc39(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecsc39")), info)); p.setOamandatepublications(MigrationUtils.field(Boolean.toString(rs.getBoolean("oamandatepublications")), info)); p.setEcarticle29_3(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecarticle29_3")), info)); - p.setSubjects(null); // List //TODO + p.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info)); p.setFundingtree(null); // List> //TODO - p.setContracttype(null); // Qualifier //TODO + p.setContracttype(prepareQualifierSplitting(rs.getString("contracttype"))); p.setOptional1(MigrationUtils.field(rs.getString("optional1"), info)); p.setOptional2(MigrationUtils.field(rs.getString("optional2"), info)); p.setJsonextrainfo(MigrationUtils.field(rs.getString("jsonextrainfo"), info)); @@ -207,6 +225,9 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp p.setTotalcost(new Float(rs.getDouble("totalcost"))); p.setFundedamount(new Float(rs.getDouble("fundedamount"))); + p.setDataInfo(info); + p.setLastupdatetimestamp(lastUpdateTimestamp); + // rs.getString("projectid"); // rs.getString("code"); // rs.getString("websiteurl"); @@ -222,13 +243,13 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp // 
rs.getBoolean("ecarticle29_3"); // rs.getDate("dateofcollection"); // rs.getDate("dateoftransformation"); - rs.getBoolean("inferred"); - rs.getBoolean("deletedbyinference"); - rs.getDouble("trust"); - rs.getString("inferenceprovenance"); + // rs.getBoolean("inferred"); + // rs.getBoolean("deletedbyinference"); + // rs.getDouble("trust"); + // rs.getString("inferenceprovenance"); // rs.getString("optional1"); // rs.getString("optional2"); - rs.getString("jsonextrainfo"); + // rs.getString("jsonextrainfo"); // rs.getString("contactfullname"); // rs.getString("contactfax"); // rs.getString("contactphone"); @@ -248,14 +269,14 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp emitOaf(p); } catch (final Exception e) { - // TODO: handle exception + throw new RuntimeException(e); } } public void processOrganization(final ResultSet rs) { try { - final DataInfo info = MigrationUtils.dataInfo(null, null, null, null, null, null); // TODO + final DataInfo info = prepareDataInfo(rs); final Organization o = new Organization(); @@ -269,7 +290,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp o.setOaiprovenance(null); // OAIProvenance // TODO o.setLegalshortname(MigrationUtils.field("legalshortname", info)); o.setLegalname(MigrationUtils.field("legalname", info)); - o.setAlternativeNames(null); // List> //TODO + o.setAlternativeNames(new ArrayList<>()); o.setWebsiteurl(MigrationUtils.field("websiteurl", info)); o.setLogourl(MigrationUtils.field("logourl", info)); o.setEclegalbody(MigrationUtils.field(Boolean.toString(rs.getBoolean("eclegalbody")), info)); @@ -283,7 +304,10 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp o.setEcenterprise(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecenterprise")), info)); o.setEcsmevalidated(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecsmevalidated")), info)); 
o.setEcnutscode(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecnutscode")), info)); - o.setCountry(null); // Qualifier country) { + o.setCountry(prepareQualifierSplitting(rs.getString("country"))); + + o.setDataInfo(info); + o.setLastupdatetimestamp(lastUpdateTimestamp); // rs.getString("organizationid"); // rs.getString("legalshortname"); @@ -300,87 +324,160 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp // rs.getBoolean("ecenterprise"); // rs.getBoolean("ecsmevalidated"); // rs.getBoolean("ecnutscode"); - rs.getDate("dateofcollection"); - rs.getDate("dateoftransformation"); - rs.getBoolean("inferred"); - rs.getBoolean("deletedbyinference"); - rs.getDouble("trust"); - rs.getString("inferenceprovenance"); + // rs.getDate("dateofcollection"); + // rs.getDate("dateoftransformation"); + // rs.getBoolean("inferred"); + // rs.getBoolean("deletedbyinference"); + // rs.getDouble("trust"); + // rs.getString("inferenceprovenance"); // rs.getString("collectedfromid"); // rs.getString("collectedfromname"); - rs.getString("country"); + // rs.getString("country"); rs.getString("provenanceaction"); rs.getArray("pid"); emitOaf(o); } catch (final Exception e) { - // TODO: handle exception + throw new RuntimeException(e); } } public void processDatasourceOrganization(final ResultSet rs) { try { - final Relation r = new Relation(); + final DataInfo info = prepareDataInfo(rs); + final String orgId = MigrationUtils.createOpenaireId("20", rs.getString("organization")); + final String dsId = MigrationUtils.createOpenaireId("10", rs.getString("datasource")); - r.setRelType(null); // TODO - r.setSubRelType(null); // TODO - r.setRelClass(null); // TODO - r.setSource(null); // TODO - r.setTarget(null); // TODO - r.setCollectedFrom(MigrationUtils.listKeyValues("", "")); + final Relation r1 = new Relation(); + r1.setRelType("datasourceOrganization"); + r1.setSubRelType("provision"); + r1.setRelClass("isProvidedBy"); + r1.setSource(dsId); + 
r1.setTarget(orgId); + r1.setCollectedFrom(null);// TODO + r1.setDataInfo(info); + r1.setLastupdatetimestamp(lastUpdateTimestamp); + emitOaf(r1); - rs.getString("datasource"); - rs.getString("organization"); - rs.getDate("startdate"); // NULL - rs.getDate("enddate"); // NULL - rs.getBoolean("inferred"); // false - rs.getBoolean("deletedbyinference"); // false - rs.getDouble("trust"); // 0.9 - rs.getString("inferenceprovenance"); // NULL - rs.getString("semantics"); // 'providedBy@@@provided - // by@@@dnet:datasources_organizations_typologies@@@dnet:datasources_organizations_typologies' AS - // semantics, - rs.getString("provenanceaction"); // d.provenanceaction || '@@@' || d.provenanceaction || - // '@@@dnet:provenanceActions@@@dnet:provenanceActions' AS provenanceaction + final Relation r2 = new Relation(); + r2.setRelType("datasourceOrganization"); + r2.setSubRelType("provision"); + r2.setRelClass("provides"); + r2.setSource(orgId); + r2.setTarget(dsId); + r2.setCollectedFrom(null); // TODO + r2.setDataInfo(info); + r2.setLastupdatetimestamp(lastUpdateTimestamp); + emitOaf(r2); + + // rs.getString("datasource"); + // rs.getString("organization"); + // rs.getDate("startdate"); // NULL + // rs.getDate("enddate"); // NULL + // rs.getBoolean("inferred"); // false + // rs.getBoolean("deletedbyinference"); // false + // rs.getDouble("trust"); // 0.9 + // rs.getString("inferenceprovenance"); // NULL + // rs.getString("semantics"); // 'providedBy@@@provided + // by@@@dnet:datasources_organizations_typologies@@@dnet:datasources_organizations_typologies' AS + // semantics, + // rs.getString("provenanceaction"); // d.provenanceaction || '@@@' || d.provenanceaction || + // '@@@dnet:provenanceActions@@@dnet:provenanceActions' AS provenanceaction - emitOaf(r); } catch (final Exception e) { - // TODO: handle exception + throw new RuntimeException(e); } } public void processProjectOrganization(final ResultSet rs) { try { - final Relation r = new Relation(); + final DataInfo info
= prepareDataInfo(rs); + final String orgId = MigrationUtils.createOpenaireId("20", rs.getString("resporganization")); + final String projectId = MigrationUtils.createOpenaireId("40", rs.getString("project")); - r.setRelType(null); // TODO - r.setSubRelType(null); // TODO - r.setRelClass(null); // TODO - r.setSource(null); // TODO - r.setTarget(null); // TODO - r.setCollectedFrom(null); + final Relation r1 = new Relation(); + r1.setRelType("projectOrganization"); + r1.setSubRelType("participation"); + r1.setRelClass("isParticipant"); + r1.setSource(projectId); + r1.setTarget(orgId); + r1.setCollectedFrom(null);// TODO + r1.setDataInfo(info); + r1.setLastupdatetimestamp(lastUpdateTimestamp); + emitOaf(r1); + + final Relation r2 = new Relation(); + r2.setRelType("projectOrganization"); + r2.setSubRelType("participation"); + r2.setRelClass("hasParticipant"); + r2.setSource(orgId); + r2.setTarget(projectId); + r2.setCollectedFrom(null); // TODO + r2.setDataInfo(info); + r2.setLastupdatetimestamp(lastUpdateTimestamp); + emitOaf(r2); + + // rs.getString("project"); + // rs.getString("resporganization"); + // rs.getInt("participantnumber"); + // rs.getDouble("contribution"); + // rs.getDate("startdate");// null + // rs.getDate("enddate");// null + // rs.getBoolean("inferred");// false + // rs.getBoolean("deletedbyinference"); // false + // rs.getDouble("trust"); + // rs.getString("inferenceprovenance"); // NULL + // rs.getString("semantics"); // po.semanticclass || '@@@' || po.semanticclass || + // '@@@dnet:project_organization_relations@@@dnet:project_organization_relations' AS semantics, + // rs.getString("provenanceaction"); // + // 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' + // AS provenanceaction - rs.getString("project"); - rs.getString("resporganization"); - rs.getInt("participantnumber"); - rs.getDouble("contribution"); - rs.getDate("startdate");// null - rs.getDate("enddate");//
null - rs.getBoolean("inferred");// false - rs.getBoolean("deletedbyinference"); // false - rs.getDouble("trust"); - rs.getString("inferenceprovenance"); // NULL - rs.getString("semantics"); // po.semanticclass || '@@@' || po.semanticclass || - // '@@@dnet:project_organization_relations@@@dnet:project_organization_relations' AS semantics, - rs.getString("provenanceaction"); // 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' - // AS provenanceaction - emitOaf(r); } catch (final Exception e) { - // TODO: handle exception + throw new RuntimeException(e); } } + private DataInfo prepareDataInfo(final ResultSet rs) throws SQLException { + final Boolean deletedbyinference = rs.getBoolean("deletedbyinference"); + final String inferenceprovenance = rs.getString("inferenceprovenance"); + final Boolean inferred = rs.getBoolean("inferred"); + final String trust = rs.getString("trust"); + return MigrationUtils.dataInfo(deletedbyinference, inferenceprovenance, inferred, false, ENTITYREGISTRY_PROVENANCE_ACTION, trust); + } + + private Qualifier prepareQualifierSplitting(final String s) { + if (StringUtils.isBlank(s)) { return null; } + final String[] arr = s.split("@@@"); + return arr.length == 4 ? 
MigrationUtils.qualifier(arr[0], arr[1], arr[2], arr[3]) : null; + } + + private StructuredProperty prepareStructProp(final String s, final DataInfo dataInfo) { + if (StringUtils.isBlank(s)) { return null; } + final String[] parts = s.split("###"); + if (parts.length == 2) { + final String value = parts[0]; + final String[] arr = parts[1].split("@@@"); + if (arr.length == 4) { return MigrationUtils.structuredProperty(value, arr[0], arr[1], arr[2], arr[3], dataInfo); } + } + return null; + } + + private List prepareListOfStructProps(final Array array, final DataInfo dataInfo) throws SQLException { + final List res = new ArrayList<>(); + if (array != null) { + for (final String s : (String[]) array.getArray()) { + final StructuredProperty sp = prepareStructProp(s, dataInfo); + if (sp != null) { + res.add(sp); + } + } + } + + return res; + } + @Override public void close() throws IOException { super.close(); diff --git a/dhp-workflows/dhp-dedup/pom.xml b/dhp-workflows/dhp-dedup/pom.xml index 28ef6a453..81ac94f01 100644 --- a/dhp-workflows/dhp-dedup/pom.xml +++ b/dhp-workflows/dhp-dedup/pom.xml @@ -9,6 +9,37 @@ 4.0.0 dhp-dedup + + + + + net.alchim31.maven + scala-maven-plugin + 4.0.1 + + + scala-compile-first + initialize + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + + ${scala.version} + + + + + diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/GraphMappingUtils.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/GraphMappingUtils.java index ab19ff2b5..0291be47e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/GraphMappingUtils.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/GraphMappingUtils.java @@ -1,23 +1,31 @@ package eu.dnetlib.dhp.graph; -import com.google.common.collect.Maps; -import eu.dnetlib.dhp.schema.oaf.*; - import java.util.Map; +import com.google.common.collect.Maps; + +import 
eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Datasource; +import eu.dnetlib.dhp.schema.oaf.Organization; +import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; +import eu.dnetlib.dhp.schema.oaf.Project; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Software; + public class GraphMappingUtils { - public final static Map types = Maps.newHashMap(); + public final static Map types = Maps.newHashMap(); - static { - types.put("datasource", Datasource.class); - types.put("organization", Organization.class); - types.put("project", Project.class); - types.put("dataset", Dataset.class); - types.put("otherresearchproduct", OtherResearchProduct.class); - types.put("software", Software.class); - types.put("publication", Publication.class); - types.put("relation", Relation.class); - } + static { + types.put("datasource", Datasource.class); + types.put("organization", Organization.class); + types.put("project", Project.class); + types.put("dataset", Dataset.class); + types.put("otherresearchproduct", OtherResearchProduct.class); + types.put("software", Software.class); + types.put("publication", Publication.class); + types.put("relation", Relation.class); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/SparkGraphImporterJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/SparkGraphImporterJob.java index a6a4e9291..463bffae9 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/SparkGraphImporterJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/SparkGraphImporterJob.java @@ -1,7 +1,5 @@ package eu.dnetlib.dhp.graph; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.Text; import org.apache.spark.api.java.JavaRDD; @@ -9,42 +7,47 @@ 
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; import scala.Tuple2; public class SparkGraphImporterJob { - public static void main(String[] args) throws Exception { + public static void main(final String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkGraphImporterJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_graph_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = SparkSession - .builder() - .appName(SparkGraphImporterJob.class.getSimpleName()) - .master(parser.get("master")) - .config("hive.metastore.uris", parser.get("hive_metastore_uris")) - .enableHiveSupport() - .getOrCreate(); + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils.toString(SparkGraphImporterJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_graph_parameters.json"))); + parser.parseArgument(args); + final SparkSession spark = SparkSession + .builder() + .appName(SparkGraphImporterJob.class.getSimpleName()) + .master(parser.get("master")) + .config("hive.metastore.uris", parser.get("hive_metastore_uris")) + .enableHiveSupport() + .getOrCreate(); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String inputPath = parser.get("sourcePath"); - final String hiveDbName = parser.get("hive_db_name"); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final String inputPath = parser.get("sourcePath"); + final String hiveDbName = parser.get("hive_db_name"); - spark.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", hiveDbName)); + spark.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", hiveDbName)); - // Read the input file and convert it into RDD of 
serializable object - GraphMappingUtils.types.forEach((name, clazz) -> { - final JavaRDD> inputRDD = sc.sequenceFile(inputPath + "/" + name, Text.class, Text.class) - .map(item -> new Tuple2<>(item._1.toString(), item._2.toString())); + // Read the input file and convert it into RDD of serializable object + GraphMappingUtils.types.forEach((name, clazz) -> { + final JavaRDD> inputRDD = sc.sequenceFile(inputPath + "/" + name, Text.class, Text.class) + .map(item -> new Tuple2<>(item._1.toString(), item._2.toString())); - spark.createDataset(inputRDD - .filter(s -> s._1().equals(clazz.getName())) - .map(Tuple2::_2) - .map(s -> new ObjectMapper().readValue(s, clazz)) - .rdd(), Encoders.bean(clazz)) - .write() - .mode(SaveMode.Overwrite) - .saveAsTable(hiveDbName + "." + name); - }); + spark.createDataset(inputRDD + .filter(s -> s._1().equals(clazz.getName())) + .map(Tuple2::_2) + .map(s -> new ObjectMapper().readValue(s, clazz)) + .rdd(), Encoders.bean(clazz)) + .write() + .mode(SaveMode.Overwrite) + .saveAsTable(hiveDbName + "." 
+ name); + }); - } + } } diff --git a/pom.xml b/pom.xml index aedf5ebff..a27cf4fe7 100644 --- a/pom.xml +++ b/pom.xml @@ -1,426 +1,411 @@ - + - 4.0.0 - eu.dnetlib.dhp - dhp - 1.0.5-SNAPSHOT - pom + 4.0.0 + eu.dnetlib.dhp + dhp + 1.0.5-SNAPSHOT + pom - http://www.d-net.research-infrastructures.eu + http://www.d-net.research-infrastructures.eu - - - The Apache Software License, Version 2.0 - http://www.apache.org/licenses/LICENSE-2.0.txt - repo - A business-friendly OSS license - - + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + A business-friendly OSS license + + - - dhp-build - dhp-schemas - dhp-common - dhp-workflows - + + dhp-build + dhp-schemas + dhp-common + dhp-workflows + - - Redmine - https://issue.openaire.research-infrastructures.eu/projects/openaire - + + Redmine + https://issue.openaire.research-infrastructures.eu/projects/openaire + - - jenkins - https://jenkins-dnet.d4science.org/ - + + jenkins + https://jenkins-dnet.d4science.org/ + - - scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git - scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git - https://code-repo.d4science.org/D-Net/dnet-hadoop/ - HEAD - + + scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git + scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git + https://code-repo.d4science.org/D-Net/dnet-hadoop/ + HEAD + - - + + - - - dnet45-releases - D-Net 45 releases - http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases - default - - false - - - true - - - - cloudera - Cloudera Repository - https://repository.cloudera.com/artifactory/cloudera-repos - - true - - - false - - - + + + dnet45-releases + D-Net 45 releases + http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases + default + + false + + + true + + + + cloudera + Cloudera Repository + https://repository.cloudera.com/artifactory/cloudera-repos + + true + + + false + + + - - - junit - junit - 
4.12 - test - + + + junit + junit + 4.12 + test + - - org.mockito - mockito-core - 2.7.22 - test - + + org.mockito + mockito-core + 2.7.22 + test + - + - - - - org.apache.hadoop - hadoop-hdfs - ${dhp.hadoop.version} - provided - - - org.apache.hadoop - hadoop-client - ${dhp.hadoop.version} - provided - - - org.apache.spark - spark-core_2.11 - ${dhp.spark.version} - provided - - - org.apache.spark - spark-sql_2.11 - ${dhp.spark.version} - provided - - - org.apache.spark - spark-graphx_2.11 - ${dhp.spark.version} - provided - + + + + org.apache.hadoop + hadoop-hdfs + ${dhp.hadoop.version} + provided + + + org.apache.hadoop + hadoop-client + ${dhp.hadoop.version} + provided + + + org.apache.spark + spark-core_2.11 + ${dhp.spark.version} + provided + + + org.apache.spark + spark-sql_2.11 + ${dhp.spark.version} + provided + + + org.apache.spark + spark-graphx_2.11 + ${dhp.spark.version} + provided + - - org.apache.commons - commons-lang3 - ${dhp.commons.lang.version} - + + org.apache.commons + commons-lang3 + ${dhp.commons.lang.version} + - - commons-codec - commons-codec - 1.9 - + + commons-codec + commons-codec + 1.9 + - - commons-io - commons-io - 2.4 - + + commons-io + commons-io + 2.4 + - - commons-cli - commons-cli - 1.2 - provided - + + commons-cli + commons-cli + 1.2 + provided + - - net.sf.saxon - Saxon-HE - 9.5.1-5 - + + net.sf.saxon + Saxon-HE + 9.5.1-5 + - - dom4j - dom4j - 1.6.1 - + + dom4j + dom4j + 1.6.1 + - - xml-apis - xml-apis - 1.4.01 - + + xml-apis + xml-apis + 1.4.01 + - - jaxen - jaxen - 1.1.6 - + + jaxen + jaxen + 1.1.6 + - - net.schmizz - sshj - 0.10.0 - test - + + net.schmizz + sshj + 0.10.0 + test + - - com.fasterxml.jackson.core - jackson-core - ${dhp.jackson.version} - provided - + + com.fasterxml.jackson.core + jackson-core + ${dhp.jackson.version} + provided + - - com.fasterxml.jackson.core - jackson-annotations - ${dhp.jackson.version} - provided - - - com.fasterxml.jackson.core - jackson-databind - ${dhp.jackson.version} - provided - + + 
com.fasterxml.jackson.core + jackson-annotations + ${dhp.jackson.version} + provided + + + com.fasterxml.jackson.core + jackson-databind + ${dhp.jackson.version} + provided + - - eu.dnetlib - dnet-pace-core - 4.0.0-SNAPSHOT - + + eu.dnetlib + dnet-pace-core + 4.0.0-SNAPSHOT + - - javax.persistence - javax.persistence-api - 2.2 - provided - + + javax.persistence + javax.persistence-api + 2.2 + provided + - - com.rabbitmq - amqp-client - 5.6.0 - - - com.jayway.jsonpath - json-path - 2.4.0 - - - com.arakelian - java-jq - 0.10.1 - - - edu.cmu - secondstring - 1.0.0 - + + com.rabbitmq + amqp-client + 5.6.0 + + + com.jayway.jsonpath + json-path + 2.4.0 + + + com.arakelian + java-jq + 0.10.1 + + + edu.cmu + secondstring + 1.0.0 + - - org.apache.oozie - oozie-client - ${dhp.oozie.version} - provided - - - - slf4j-simple - org.slf4j - - - - - + + org.mongodb + mongo-java-driver + ${mongodb.driver.version} + - - target - target/classes - ${project.artifactId}-${project.version} - target/test-classes - - - - org.apache.maven.plugins - maven-compiler-plugin - ${maven.compiler.plugin.version} - - 1.8 - 1.8 - ${project.build.sourceEncoding} - - + + org.apache.oozie + oozie-client + ${dhp.oozie.version} + provided + + + + slf4j-simple + org.slf4j + + + + + - - org.apache.maven.plugins - maven-jar-plugin - 3.0.2 - + + target + target/classes + ${project.artifactId}-${project.version} + target/test-classes + + + + org.apache.maven.plugins + maven-compiler-plugin + ${maven.compiler.plugin.version} + + 1.8 + 1.8 + ${project.build.sourceEncoding} + + - - org.apache.maven.plugins - maven-source-plugin - 3.0.1 - - - attach-sources - verify - - jar-no-fork - - - - + + org.apache.maven.plugins + maven-jar-plugin + 3.0.2 + - - org.apache.maven.plugins - maven-surefire-plugin - 2.19.1 - - true - - - - org.apache.maven.plugins - maven-javadoc-plugin - 2.10.4 - - true - - - - org.apache.maven.plugins - maven-dependency-plugin - 3.0.0 - + + org.apache.maven.plugins + maven-source-plugin + 
3.0.1 + + + attach-sources + verify + + jar-no-fork + + + + - - org.codehaus.mojo - build-helper-maven-plugin - 1.12 - - - - - - org.apache.maven.plugins - maven-release-plugin - 2.5.3 - - - org.jacoco - jacoco-maven-plugin - 0.7.9 - - - **/schemas/* - **/com/cloudera/**/* - **/org/apache/avro/io/**/* - - - - - default-prepare-agent - - prepare-agent - - - - default-report - prepare-package - - report - - - - - - net.alchim31.maven - scala-maven-plugin - 4.0.1 - - - scala-compile-first - initialize - - add-source - compile - - - - scala-test-compile - process-test-resources - - testCompile - - - - - ${scala.version} - - - + + org.apache.maven.plugins + maven-surefire-plugin + 2.19.1 + + true + + + + org.apache.maven.plugins + maven-javadoc-plugin + 2.10.4 + + true + + + + org.apache.maven.plugins + maven-dependency-plugin + 3.0.0 + - - - org.apache.maven.wagon - wagon-ssh - 2.10 - - - - - - dnet45-snapshots - DNet45 Snapshots - http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-snapshots - default - - - dnet45-releases - http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - 2.10.4 - - true - - - - + + org.codehaus.mojo + build-helper-maven-plugin + 1.12 + + + + + + org.apache.maven.plugins + maven-release-plugin + 2.5.3 + + + org.jacoco + jacoco-maven-plugin + 0.7.9 + + + **/schemas/* + **/com/cloudera/**/* + **/org/apache/avro/io/**/* + + + + + default-prepare-agent + + prepare-agent + + + + default-report + prepare-package + + report + + + + - - UTF-8 - UTF-8 - 3.6.0 - 2.22.2 - cdh5.9.2 - 2.6.0-${dhp.cdh.version} - 4.1.0-${dhp.cdh.version} - 2.4.0.cloudera2 - 2.9.6 - 3.5 - 2.11.12 - + + + + + org.apache.maven.wagon + wagon-ssh + 2.10 + + + + + + dnet45-snapshots + DNet45 Snapshots + http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-snapshots + default + + + dnet45-releases + 
http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 2.10.4 + + true + + + + + + + UTF-8 + UTF-8 + 3.6.0 + 2.22.2 + cdh5.9.2 + 2.6.0-${dhp.cdh.version} + 4.1.0-${dhp.cdh.version} + 2.4.0.cloudera2 + 2.9.6 + 3.5 + 2.11.12 + 3.4.2 + From cd114f1c3b0cb36c045c16800dfd1251550463e5 Mon Sep 17 00:00:00 2001 From: Michele Artini Date: Tue, 21 Jan 2020 12:32:10 +0100 Subject: [PATCH 09/45] partial update --- .../MigrateDbEntitiesApplication.java | 85 ++++++++++++------- .../dnetlib/dhp/migration/MigrationUtils.java | 10 --- .../sql/queryDatasourceOrganization.sql | 5 +- .../sql/queryProjectOrganization.sql | 5 +- 4 files changed, 60 insertions(+), 45 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java index efc395812..6b537c840 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java @@ -19,6 +19,9 @@ import org.apache.commons.logging.LogFactory; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Datasource; +import eu.dnetlib.dhp.schema.oaf.Field; +import eu.dnetlib.dhp.schema.oaf.Journal; +import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.Organization; import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.schema.oaf.Qualifier; @@ -84,10 +87,9 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp ds.setCollectedfrom(MigrationUtils.listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); ds.setPid(null); // List // TODO 
ds.setDateofcollection(rs.getDate("dateofcollection").toString()); - ds.setDateoftransformation(null); // TODO + ds.setDateoftransformation(null); // Value not returned by the SQL query ds.setExtraInfo(null); // TODO - ds.setOaiprovenance(null); // TODO - + ds.setOaiprovenance(null); // Values not present in the DB ds.setDatasourcetype(prepareQualifierSplitting(rs.getString("datasourcetype"))); ds.setOpenairecompatibility(prepareQualifierSplitting(rs.getString("openairecompatibility"))); ds.setOfficialname(MigrationUtils.field(rs.getString("officialname"), info)); @@ -104,9 +106,9 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp ds.setOdnumberofitems(MigrationUtils.field(Double.toString(rs.getInt("odnumberofitems")), info)); ds.setOdnumberofitemsdate(MigrationUtils.field(rs.getDate("odnumberofitemsdate").toString(), info)); ds.setOdpolicies(MigrationUtils.field(rs.getString("odpolicies"), info)); - ds.setOdlanguages(MigrationUtils.listFields(info, rs.getArray("odlanguages"))); - ds.setOdcontenttypes(MigrationUtils.listFields(info, rs.getArray("odcontenttypes"))); - ds.setAccessinfopackage(MigrationUtils.listFields(info, rs.getArray("accessinfopackage"))); + ds.setOdlanguages(prepareListFields(rs.getArray("odlanguages"), info)); + ds.setOdcontenttypes(prepareListFields(rs.getArray("odcontenttypes"), info)); + ds.setAccessinfopackage(prepareListFields(rs.getArray("accessinfopackage"), info)); ds.setReleasestartdate(MigrationUtils.field(rs.getDate("releasestartdate").toString(), info)); ds.setReleaseenddate(MigrationUtils.field(rs.getDate("releaseenddate").toString(), info)); ds.setMissionstatementurl(MigrationUtils.field(rs.getString("missionstatementurl"), info)); @@ -121,14 +123,13 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp ds.setQualitymanagementkind(MigrationUtils.field(rs.getString("qualitymanagementkind"), info)); ds.setPidsystems(MigrationUtils.field(rs.getString("pidsystems"), info)); 
ds.setCertificates(MigrationUtils.field(rs.getString("certificates"), info)); - ds.setPolicies(null); // List // TODO - ds.setJournal(null); // Journal // TODO - + ds.setPolicies(new ArrayList<>()); // The sql query returns an empty array + ds.setJournal(prepareJournal(rs.getString("officialname"), rs.getString("journal"), info)); // Journal ds.setDataInfo(info); ds.setLastupdatetimestamp(lastUpdateTimestamp); // rs.getString("datasourceid"); - rs.getArray("identities"); + // rs.getArray("identities"); // rs.getString("officialname"); // rs.getString("englishname"); // rs.getString("contactemail"); @@ -166,14 +167,14 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp // rs.getString("qualitymanagementkind"); // rs.getString("pidsystems"); // rs.getString("certificates"); - rs.getArray("policies"); + // rs.getArray("policies"); // rs.getString("collectedfromid"); // rs.getString("collectedfromname"); - // rs.getString("datasourcetype"); // COMPLEX XXX@@@@.... 
+ // rs.getString("datasourcetype"); // COMPLEX // rs.getString("provenanceaction"); // // 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' // AS provenanceaction, - rs.getString("journal"); // CONCAT(d.issn, '@@@', d.eissn, '@@@', d.lissn) AS journal + // rs.getString("journal"); // CONCAT(d.issn, '@@@', d.eissn, '@@@', d.lissn) AS journal emitOaf(ds); } catch (final Exception e) { @@ -192,12 +193,10 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp p.setOriginalId(Arrays.asList(rs.getString("projectid"))); p.setCollectedfrom(MigrationUtils.listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); p.setPid(null); // List // TODO - p.setDateofcollection(rs.getDate("dateofcollection").toString()); p.setDateoftransformation(rs.getDate("dateoftransformation").toString()); p.setExtraInfo(null); // List //TODO - p.setOaiprovenance(null); // OAIProvenance /TODO - + p.setOaiprovenance(null); // Values not present in the DB p.setWebsiteurl(MigrationUtils.field(rs.getString("websiteurl"), info)); p.setCode(MigrationUtils.field(rs.getString("code"), info)); p.setAcronym(MigrationUtils.field(rs.getString("acronym"), info)); @@ -211,7 +210,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp p.setOamandatepublications(MigrationUtils.field(Boolean.toString(rs.getBoolean("oamandatepublications")), info)); p.setEcarticle29_3(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecarticle29_3")), info)); p.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info)); - p.setFundingtree(null); // List> //TODO + p.setFundingtree(prepareListFields(rs.getArray("fundingtree"), info)); p.setContracttype(prepareQualifierSplitting(rs.getString("contracttype"))); p.setOptional1(MigrationUtils.field(rs.getString("optional1"), info)); p.setOptional2(MigrationUtils.field(rs.getString("optional2"), info)); @@ -224,7 
+223,6 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp p.setCurrency(MigrationUtils.field(rs.getString("currency"), info)); p.setTotalcost(new Float(rs.getDouble("totalcost"))); p.setFundedamount(new Float(rs.getDouble("fundedamount"))); - p.setDataInfo(info); p.setLastupdatetimestamp(lastUpdateTimestamp); @@ -260,11 +258,11 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp // rs.getDouble("fundedamount"); // rs.getString("collectedfromid"); // rs.getString("collectedfromname"); - rs.getString("contracttype"); // COMPLEX - rs.getString("provenanceaction"); // COMPLEX - rs.getArray("pid"); - rs.getArray("subjects"); - rs.getArray("fundingtree"); + // rs.getString("contracttype"); // COMPLEX + // rs.getString("provenanceaction"); // COMPLEX + // rs.getArray("pid"); + // rs.getArray("subjects"); + // rs.getArray("fundingtree"); emitOaf(p); @@ -287,10 +285,10 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp o.setDateofcollection(rs.getDate("dateofcollection").toString()); o.setDateoftransformation(rs.getDate("dateoftransformation").toString()); o.setExtraInfo(null); // List // TODO - o.setOaiprovenance(null); // OAIProvenance // TODO + o.setOaiprovenance(null); // Values not present in the DB o.setLegalshortname(MigrationUtils.field("legalshortname", info)); o.setLegalname(MigrationUtils.field("legalname", info)); - o.setAlternativeNames(new ArrayList<>()); + o.setAlternativeNames(new ArrayList<>()); // Values not returned by the SQL query o.setWebsiteurl(MigrationUtils.field("websiteurl", info)); o.setLogourl(MigrationUtils.field("logourl", info)); o.setEclegalbody(MigrationUtils.field(Boolean.toString(rs.getBoolean("eclegalbody")), info)); @@ -305,7 +303,6 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp o.setEcsmevalidated(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecsmevalidated")), info)); 
o.setEcnutscode(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecnutscode")), info)); o.setCountry(prepareQualifierSplitting(rs.getString("country"))); - o.setDataInfo(info); o.setLastupdatetimestamp(lastUpdateTimestamp); @@ -333,8 +330,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp // rs.getString("collectedfromid"); // rs.getString("collectedfromname"); // rs.getString("country"); - rs.getString("provenanceaction"); - rs.getArray("pid"); + // rs.getString("provenanceaction"); + // rs.getArray("pid"); emitOaf(o); } catch (final Exception e) { @@ -348,6 +345,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp final DataInfo info = prepareDataInfo(rs); final String orgId = MigrationUtils.createOpenaireId("20", rs.getString("organization")); final String dsId = MigrationUtils.createOpenaireId("10", rs.getString("datasource")); + final List collectedFrom = MigrationUtils.listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")); final Relation r1 = new Relation(); r1.setRelType("datasourceOrganization"); @@ -355,7 +353,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp r1.setRelClass("isProvidedBy"); r1.setSource(dsId); r1.setTarget(orgId); - r1.setCollectedFrom(null);// TODO + r1.setCollectedFrom(collectedFrom); r1.setDataInfo(info); r1.setLastupdatetimestamp(lastUpdateTimestamp); emitOaf(r1); @@ -366,7 +364,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp r2.setRelClass("provides"); r2.setSource(orgId); r2.setTarget(dsId); - r2.setCollectedFrom(null); // TODO + r2.setCollectedFrom(collectedFrom); r2.setDataInfo(info); r1.setLastupdatetimestamp(lastUpdateTimestamp); emitOaf(r2); @@ -395,6 +393,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp final DataInfo info = prepareDataInfo(rs); final String orgId = MigrationUtils.createOpenaireId("20", 
rs.getString("resporganization")); final String projectId = MigrationUtils.createOpenaireId("40", rs.getString("project")); + final List collectedFrom = MigrationUtils.listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")); final Relation r1 = new Relation(); r1.setRelType("projectOrganization"); @@ -402,7 +401,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp r1.setRelClass("isParticipant"); r1.setSource(projectId); r1.setTarget(orgId); - r1.setCollectedFrom(null);// TODO + r1.setCollectedFrom(collectedFrom); r1.setDataInfo(info); r1.setLastupdatetimestamp(lastUpdateTimestamp); emitOaf(r1); @@ -413,7 +412,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp r2.setRelClass("hasParticipant"); r2.setSource(orgId); r2.setTarget(projectId); - r2.setCollectedFrom(null); // TODO + r2.setCollectedFrom(collectedFrom); r2.setDataInfo(info); r1.setLastupdatetimestamp(lastUpdateTimestamp); emitOaf(r2); @@ -453,6 +452,14 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp return arr.length == 4 ? MigrationUtils.qualifier(arr[0], arr[1], arr[2], arr[3]) : null; } + public static List> prepareListFields(final Array array, final DataInfo info) { + try { + return MigrationUtils.listFields(info, (String[]) array.getArray()); + } catch (final SQLException e) { + throw new RuntimeException("Invalid SQL array", e); + } + } + private StructuredProperty prepareStructProp(final String s, final DataInfo dataInfo) { if (StringUtils.isBlank(s)) { return null; } final String[] parts = s.split("###"); @@ -478,6 +485,20 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp return res; } + private Journal prepareJournal(final String name, final String sj, final DataInfo info) { + if (StringUtils.isNotBlank(sj)) { + final String[] arr = sj.split("@@@"); + if (arr.length == 3) { + final String issn = StringUtils.isNotBlank(arr[0]) ? 
arr[0] : null; + final String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1] : null;; + final String lissn = StringUtils.isNotBlank(arr[2]) ? arr[2] : null;; + if (issn != null || eissn != null || lissn != null) { return MigrationUtils + .journal(name, issn, eissn, eissn, null, null, null, null, null, null, null, info); } + } + } + return null; + } + @Override public void close() throws IOException { super.close(); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrationUtils.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrationUtils.java index 8346a8041..c58688a79 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrationUtils.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrationUtils.java @@ -1,7 +1,5 @@ package eu.dnetlib.dhp.migration; -import java.sql.Array; -import java.sql.SQLException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -50,14 +48,6 @@ public class MigrationUtils { return Arrays.stream(values).map(v -> field(v, info)).collect(Collectors.toList()); } - public static List> listFields(final DataInfo info, final Array array) { - try { - return listFields(info, (String[]) array.getArray()); - } catch (final SQLException e) { - throw new RuntimeException("Invalid SQL array", e); - } - } - public static Qualifier qualifier(final String classid, final String classname, final String schemeid, final String schemename) { final Qualifier q = new Qualifier(); q.setClassid(classid); diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasourceOrganization.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasourceOrganization.sql index 885b6ae09..745f83971 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasourceOrganization.sql +++ 
b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasourceOrganization.sql @@ -7,10 +7,11 @@ SELECT false AS deletedbyinference, 0.9 AS trust, NULL AS inferenceprovenance, - + dc.id AS collectedfromid, + dc.officialname AS collectedfromname, 'providedBy@@@provided by@@@dnet:datasources_organizations_typologies@@@dnet:datasources_organizations_typologies' AS semantics, d.provenanceaction || '@@@' || d.provenanceaction || '@@@dnet:provenanceActions@@@dnet:provenanceActions' AS provenanceaction FROM dsm_datasource_organization dor LEFT OUTER JOIN dsm_datasources d ON (dor.datasource = d.id) - + LEFT OUTER JOIN dsm_datasources dc ON (dc.id = d.collectedfrom) diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjectOrganization.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjectOrganization.sql index 4483d6145..4c06ca5b9 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjectOrganization.sql +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjectOrganization.sql @@ -9,8 +9,11 @@ SELECT false AS deletedbyinference, po.trust AS trust, NULL AS inferenceprovenance, - + dc.id AS collectedfromid, + dc.officialname AS collectedfromname, po.semanticclass || '@@@' || po.semanticclass || '@@@dnet:project_organization_relations@@@dnet:project_organization_relations' AS semantics, 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction FROM project_organization po + LEFT OUTER JOIN projects p ON (p.id = po.project) + LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom) From f6eccdde33a0d7f6d0f42e362f2b4773c3d6f10c Mon Sep 17 00:00:00 2001 From: Michele Artini Date: Tue, 21 Jan 2020 14:17:05 +0100 Subject: [PATCH 10/45] partial implementation --- 
.../migration/AbstractMigrateApplication.java | 147 +++++++++++++++ .../MigrateDbEntitiesApplication.java | 167 +++++++++--------- 2 files changed, 230 insertions(+), 84 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrateApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrateApplication.java index a5c8b2775..b8f92fb9c 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrateApplication.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrateApplication.java @@ -3,8 +3,13 @@ package eu.dnetlib.dhp.migration; import java.io.Closeable; import java.io.IOException; import java.net.URI; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -13,7 +18,17 @@ import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.codehaus.jackson.map.ObjectMapper; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.ExtraInfo; +import eu.dnetlib.dhp.schema.oaf.Field; +import eu.dnetlib.dhp.schema.oaf.Journal; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.OAIProvenance; import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.OriginDescription; +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.dhp.utils.DHPUtils; public class AbstractMigrateApplication implements Closeable { @@ -58,4 +73,136 @@ public class AbstractMigrateApplication implements Closeable { writer.close(); } + public static KeyValue keyValue(final String k, final String v) { + final KeyValue kv = new 
KeyValue(); + kv.setKey(k); + kv.setValue(v); + return kv; + } + + public static List listKeyValues(final String... s) { + if (s.length % 2 > 0) { throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)"); } + + final List list = new ArrayList<>(); + for (int i = 0; i < s.length; i += 2) { + list.add(keyValue(s[i], s[i + 1])); + } + return list; + } + + public static Field field(final T value, final DataInfo info) { + final Field field = new Field<>(); + field.setValue(value); + field.setDataInfo(info); + return field; + } + + public static List> listFields(final DataInfo info, final String... values) { + return Arrays.stream(values).map(v -> field(v, info)).collect(Collectors.toList()); + } + + public static Qualifier qualifier(final String classid, final String classname, final String schemeid, final String schemename) { + final Qualifier q = new Qualifier(); + q.setClassid(classid); + q.setClassname(classname); + q.setSchemeid(schemeid); + q.setSchemename(schemename); + return q; + } + + public static StructuredProperty structuredProperty(final String value, + final String classid, + final String classname, + final String schemeid, + final String schemename, + final DataInfo dataInfo) { + final StructuredProperty sp = new StructuredProperty(); + sp.setValue(value); + sp.setQualifier(qualifier(classid, classname, schemeid, schemename)); + sp.setDataInfo(dataInfo); + return sp; + } + + public static ExtraInfo extraInfo(final String name, final String value, final String typology, final String provenance, final String trust) { + final ExtraInfo info = new ExtraInfo(); + info.setName(name); + info.setValue(value); + info.setTypology(typology); + info.setProvenance(provenance); + info.setTrust(trust); + return info; + } + + public static OAIProvenance oaiIProvenance(final String identifier, + final String baseURL, + final String metadataNamespace, + final Boolean altered, + final String datestamp, + final String harvestDate) { + + final 
OriginDescription desc = new OriginDescription(); + desc.setIdentifier(identifier); + desc.setBaseURL(baseURL); + desc.setMetadataNamespace(metadataNamespace); + desc.setAltered(altered); + desc.setDatestamp(datestamp); + desc.setHarvestDate(harvestDate); + + final OAIProvenance p = new OAIProvenance(); + p.setOriginDescription(desc); + + return p; + } + + public static Journal journal(final String name, + final String issnPrinted, + final String issnOnline, + final String issnLinking, + final String ep, + final String iss, + final String sp, + final String vol, + final String edition, + final String conferenceplace, + final String conferencedate, + final DataInfo dataInfo) { + final Journal j = new Journal(); + j.setName(name); + j.setIssnPrinted(issnPrinted); + j.setIssnOnline(issnOnline); + j.setIssnLinking(issnLinking); + j.setEp(ep); + j.setIss(iss); + j.setSp(sp); + j.setVol(vol); + j.setEdition(edition); + j.setConferenceplace(conferenceplace); + j.setConferencedate(conferencedate); + j.setDataInfo(dataInfo); + return j; + } + + public static DataInfo dataInfo(final Boolean deletedbyinference, + final String inferenceprovenance, + final Boolean inferred, + final Boolean invisible, + final Qualifier provenanceaction, + final String trust) { + final DataInfo d = new DataInfo(); + d.setDeletedbyinference(deletedbyinference); + d.setInferenceprovenance(inferenceprovenance); + d.setInferred(inferred); + d.setInvisible(invisible); + d.setProvenanceaction(provenanceaction); + d.setTrust(trust); + return d; + } + + public static String createOpenaireId(final String prefix, final String originalId) { + final String nsPrefix = StringUtils.substringBefore(originalId, "::"); + final String rest = StringUtils.substringAfter(originalId, "::"); + return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest)); + + } + } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java index 6b537c840..deb7fdd69 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java @@ -30,8 +30,8 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty; public class MigrateDbEntitiesApplication extends AbstractMigrateApplication implements Closeable { - private static final Qualifier ENTITYREGISTRY_PROVENANCE_ACTION = MigrationUtils - .qualifier("sysimport:crosswalk:entityregistry", "sysimport:crosswalk:entityregistry", "dnet:provenance_actions", "dnet:provenance_actions"); + private static final Qualifier ENTITYREGISTRY_PROVENANCE_ACTION = + qualifier("sysimport:crosswalk:entityregistry", "sysimport:crosswalk:entityregistry", "dnet:provenance_actions", "dnet:provenance_actions"); private static final Log log = LogFactory.getLog(MigrateDbEntitiesApplication.class); @@ -82,9 +82,9 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp final Datasource ds = new Datasource(); - ds.setId(MigrationUtils.createOpenaireId("10", rs.getString("datasourceid"))); + ds.setId(createOpenaireId("10", rs.getString("datasourceid"))); ds.setOriginalId(Arrays.asList(rs.getString("datasourceid"))); - ds.setCollectedfrom(MigrationUtils.listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); + ds.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); ds.setPid(null); // List // TODO ds.setDateofcollection(rs.getDate("dateofcollection").toString()); ds.setDateoftransformation(null); // Value not returned by the SQL query @@ -92,37 +92,37 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp ds.setOaiprovenance(null); // Values not present in the DB 
ds.setDatasourcetype(prepareQualifierSplitting(rs.getString("datasourcetype"))); ds.setOpenairecompatibility(prepareQualifierSplitting(rs.getString("openairecompatibility"))); - ds.setOfficialname(MigrationUtils.field(rs.getString("officialname"), info)); - ds.setEnglishname(MigrationUtils.field(rs.getString("englishname"), info)); - ds.setWebsiteurl(MigrationUtils.field(rs.getString("websiteurl"), info)); - ds.setLogourl(MigrationUtils.field(rs.getString("logourl"), info)); - ds.setContactemail(MigrationUtils.field(rs.getString("contactemail"), info)); - ds.setNamespaceprefix(MigrationUtils.field(rs.getString("namespaceprefix"), info)); - ds.setLatitude(MigrationUtils.field(Double.toString(rs.getDouble("latitude")), info)); - ds.setLongitude(MigrationUtils.field(Double.toString(rs.getDouble("longitude")), info)); - ds.setDateofvalidation(MigrationUtils.field(rs.getDate("dateofvalidation").toString(), info)); - ds.setDescription(MigrationUtils.field(rs.getString("description"), info)); + ds.setOfficialname(field(rs.getString("officialname"), info)); + ds.setEnglishname(field(rs.getString("englishname"), info)); + ds.setWebsiteurl(field(rs.getString("websiteurl"), info)); + ds.setLogourl(field(rs.getString("logourl"), info)); + ds.setContactemail(field(rs.getString("contactemail"), info)); + ds.setNamespaceprefix(field(rs.getString("namespaceprefix"), info)); + ds.setLatitude(field(Double.toString(rs.getDouble("latitude")), info)); + ds.setLongitude(field(Double.toString(rs.getDouble("longitude")), info)); + ds.setDateofvalidation(field(rs.getDate("dateofvalidation").toString(), info)); + ds.setDescription(field(rs.getString("description"), info)); ds.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info)); - ds.setOdnumberofitems(MigrationUtils.field(Double.toString(rs.getInt("odnumberofitems")), info)); - ds.setOdnumberofitemsdate(MigrationUtils.field(rs.getDate("odnumberofitemsdate").toString(), info)); - 
ds.setOdpolicies(MigrationUtils.field(rs.getString("odpolicies"), info)); + ds.setOdnumberofitems(field(Double.toString(rs.getInt("odnumberofitems")), info)); + ds.setOdnumberofitemsdate(field(rs.getDate("odnumberofitemsdate").toString(), info)); + ds.setOdpolicies(field(rs.getString("odpolicies"), info)); ds.setOdlanguages(prepareListFields(rs.getArray("odlanguages"), info)); ds.setOdcontenttypes(prepareListFields(rs.getArray("odcontenttypes"), info)); ds.setAccessinfopackage(prepareListFields(rs.getArray("accessinfopackage"), info)); - ds.setReleasestartdate(MigrationUtils.field(rs.getDate("releasestartdate").toString(), info)); - ds.setReleaseenddate(MigrationUtils.field(rs.getDate("releaseenddate").toString(), info)); - ds.setMissionstatementurl(MigrationUtils.field(rs.getString("missionstatementurl"), info)); - ds.setDataprovider(MigrationUtils.field(rs.getBoolean("dataprovider"), info)); - ds.setServiceprovider(MigrationUtils.field(rs.getBoolean("serviceprovider"), info)); - ds.setDatabaseaccesstype(MigrationUtils.field(rs.getString("databaseaccesstype"), info)); - ds.setDatauploadtype(MigrationUtils.field(rs.getString("datauploadtype"), info)); - ds.setDatabaseaccessrestriction(MigrationUtils.field(rs.getString("databaseaccessrestriction"), info)); - ds.setDatauploadrestriction(MigrationUtils.field(rs.getString("datauploadrestriction"), info)); - ds.setVersioning(MigrationUtils.field(rs.getBoolean("versioning"), info)); - ds.setCitationguidelineurl(MigrationUtils.field(rs.getString("citationguidelineurl"), info)); - ds.setQualitymanagementkind(MigrationUtils.field(rs.getString("qualitymanagementkind"), info)); - ds.setPidsystems(MigrationUtils.field(rs.getString("pidsystems"), info)); - ds.setCertificates(MigrationUtils.field(rs.getString("certificates"), info)); + ds.setReleasestartdate(field(rs.getDate("releasestartdate").toString(), info)); + ds.setReleaseenddate(field(rs.getDate("releaseenddate").toString(), info)); + 
ds.setMissionstatementurl(field(rs.getString("missionstatementurl"), info)); + ds.setDataprovider(field(rs.getBoolean("dataprovider"), info)); + ds.setServiceprovider(field(rs.getBoolean("serviceprovider"), info)); + ds.setDatabaseaccesstype(field(rs.getString("databaseaccesstype"), info)); + ds.setDatauploadtype(field(rs.getString("datauploadtype"), info)); + ds.setDatabaseaccessrestriction(field(rs.getString("databaseaccessrestriction"), info)); + ds.setDatauploadrestriction(field(rs.getString("datauploadrestriction"), info)); + ds.setVersioning(field(rs.getBoolean("versioning"), info)); + ds.setCitationguidelineurl(field(rs.getString("citationguidelineurl"), info)); + ds.setQualitymanagementkind(field(rs.getString("qualitymanagementkind"), info)); + ds.setPidsystems(field(rs.getString("pidsystems"), info)); + ds.setCertificates(field(rs.getString("certificates"), info)); ds.setPolicies(new ArrayList<>()); // The sql query returns an empty array ds.setJournal(prepareJournal(rs.getString("officialname"), rs.getString("journal"), info)); // Journal ds.setDataInfo(info); @@ -189,38 +189,38 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp final Project p = new Project(); - p.setId(MigrationUtils.createOpenaireId("40", rs.getString("projectid"))); + p.setId(createOpenaireId("40", rs.getString("projectid"))); p.setOriginalId(Arrays.asList(rs.getString("projectid"))); - p.setCollectedfrom(MigrationUtils.listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); + p.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); p.setPid(null); // List // TODO p.setDateofcollection(rs.getDate("dateofcollection").toString()); p.setDateoftransformation(rs.getDate("dateoftransformation").toString()); p.setExtraInfo(null); // List //TODO p.setOaiprovenance(null); // Values not present in the DB - p.setWebsiteurl(MigrationUtils.field(rs.getString("websiteurl"), info)); - 
p.setCode(MigrationUtils.field(rs.getString("code"), info)); - p.setAcronym(MigrationUtils.field(rs.getString("acronym"), info)); - p.setTitle(MigrationUtils.field(rs.getString("title"), info)); - p.setStartdate(MigrationUtils.field(rs.getDate("startdate").toString(), info)); - p.setEnddate(MigrationUtils.field(rs.getDate("enddate").toString(), info)); - p.setCallidentifier(MigrationUtils.field(rs.getString("callidentifier"), info)); - p.setKeywords(MigrationUtils.field(rs.getString("keywords"), info)); - p.setDuration(MigrationUtils.field(Integer.toString(rs.getInt("duration")), info)); - p.setEcsc39(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecsc39")), info)); - p.setOamandatepublications(MigrationUtils.field(Boolean.toString(rs.getBoolean("oamandatepublications")), info)); - p.setEcarticle29_3(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecarticle29_3")), info)); + p.setWebsiteurl(field(rs.getString("websiteurl"), info)); + p.setCode(field(rs.getString("code"), info)); + p.setAcronym(field(rs.getString("acronym"), info)); + p.setTitle(field(rs.getString("title"), info)); + p.setStartdate(field(rs.getDate("startdate").toString(), info)); + p.setEnddate(field(rs.getDate("enddate").toString(), info)); + p.setCallidentifier(field(rs.getString("callidentifier"), info)); + p.setKeywords(field(rs.getString("keywords"), info)); + p.setDuration(field(Integer.toString(rs.getInt("duration")), info)); + p.setEcsc39(field(Boolean.toString(rs.getBoolean("ecsc39")), info)); + p.setOamandatepublications(field(Boolean.toString(rs.getBoolean("oamandatepublications")), info)); + p.setEcarticle29_3(field(Boolean.toString(rs.getBoolean("ecarticle29_3")), info)); p.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info)); p.setFundingtree(prepareListFields(rs.getArray("fundingtree"), info)); p.setContracttype(prepareQualifierSplitting(rs.getString("contracttype"))); - p.setOptional1(MigrationUtils.field(rs.getString("optional1"), info)); - 
p.setOptional2(MigrationUtils.field(rs.getString("optional2"), info)); - p.setJsonextrainfo(MigrationUtils.field(rs.getString("jsonextrainfo"), info)); - p.setContactfullname(MigrationUtils.field(rs.getString("contactfullname"), info)); - p.setContactfax(MigrationUtils.field(rs.getString("contactfax"), info)); - p.setContactphone(MigrationUtils.field(rs.getString("contactphone"), info)); - p.setContactemail(MigrationUtils.field(rs.getString("contactemail"), info)); - p.setSummary(MigrationUtils.field(rs.getString("summary"), info)); - p.setCurrency(MigrationUtils.field(rs.getString("currency"), info)); + p.setOptional1(field(rs.getString("optional1"), info)); + p.setOptional2(field(rs.getString("optional2"), info)); + p.setJsonextrainfo(field(rs.getString("jsonextrainfo"), info)); + p.setContactfullname(field(rs.getString("contactfullname"), info)); + p.setContactfax(field(rs.getString("contactfax"), info)); + p.setContactphone(field(rs.getString("contactphone"), info)); + p.setContactemail(field(rs.getString("contactemail"), info)); + p.setSummary(field(rs.getString("summary"), info)); + p.setCurrency(field(rs.getString("currency"), info)); p.setTotalcost(new Float(rs.getDouble("totalcost"))); p.setFundedamount(new Float(rs.getDouble("fundedamount"))); p.setDataInfo(info); @@ -278,30 +278,29 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp final Organization o = new Organization(); - o.setId(MigrationUtils.createOpenaireId("20", rs.getString("organizationid"))); // String id) { + o.setId(createOpenaireId("20", rs.getString("organizationid"))); // String id) { o.setOriginalId(Arrays.asList(rs.getString("organizationid"))); - o.setCollectedfrom(MigrationUtils.listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); + o.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); o.setPid(null); // List // TODO 
o.setDateofcollection(rs.getDate("dateofcollection").toString()); o.setDateoftransformation(rs.getDate("dateoftransformation").toString()); o.setExtraInfo(null); // List // TODO o.setOaiprovenance(null); // Values not present in the DB - o.setLegalshortname(MigrationUtils.field("legalshortname", info)); - o.setLegalname(MigrationUtils.field("legalname", info)); + o.setLegalshortname(field("legalshortname", info)); + o.setLegalname(field("legalname", info)); o.setAlternativeNames(new ArrayList<>()); // Values not returned by the SQL query - o.setWebsiteurl(MigrationUtils.field("websiteurl", info)); - o.setLogourl(MigrationUtils.field("logourl", info)); - o.setEclegalbody(MigrationUtils.field(Boolean.toString(rs.getBoolean("eclegalbody")), info)); - o.setEclegalperson(MigrationUtils.field(Boolean.toString(rs.getBoolean("eclegalperson")), info)); - o.setEcnonprofit(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecnonprofit")), info)); - o.setEcresearchorganization(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecresearchorganization")), info)); - o.setEchighereducation(MigrationUtils.field(Boolean.toString(rs.getBoolean("echighereducation")), info)); - o.setEcinternationalorganizationeurinterests(MigrationUtils - .field(Boolean.toString(rs.getBoolean("ecinternationalorganizationeurinterests")), info)); - o.setEcinternationalorganization(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecinternationalorganization")), info)); - o.setEcenterprise(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecenterprise")), info)); - o.setEcsmevalidated(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecsmevalidated")), info)); - o.setEcnutscode(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecnutscode")), info)); + o.setWebsiteurl(field("websiteurl", info)); + o.setLogourl(field("logourl", info)); + o.setEclegalbody(field(Boolean.toString(rs.getBoolean("eclegalbody")), info)); + o.setEclegalperson(field(Boolean.toString(rs.getBoolean("eclegalperson")), 
info)); + o.setEcnonprofit(field(Boolean.toString(rs.getBoolean("ecnonprofit")), info)); + o.setEcresearchorganization(field(Boolean.toString(rs.getBoolean("ecresearchorganization")), info)); + o.setEchighereducation(field(Boolean.toString(rs.getBoolean("echighereducation")), info)); + o.setEcinternationalorganizationeurinterests(field(Boolean.toString(rs.getBoolean("ecinternationalorganizationeurinterests")), info)); + o.setEcinternationalorganization(field(Boolean.toString(rs.getBoolean("ecinternationalorganization")), info)); + o.setEcenterprise(field(Boolean.toString(rs.getBoolean("ecenterprise")), info)); + o.setEcsmevalidated(field(Boolean.toString(rs.getBoolean("ecsmevalidated")), info)); + o.setEcnutscode(field(Boolean.toString(rs.getBoolean("ecnutscode")), info)); o.setCountry(prepareQualifierSplitting(rs.getString("country"))); o.setDataInfo(info); o.setLastupdatetimestamp(lastUpdateTimestamp); @@ -343,9 +342,9 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp try { final DataInfo info = prepareDataInfo(rs); - final String orgId = MigrationUtils.createOpenaireId("20", rs.getString("organization")); - final String dsId = MigrationUtils.createOpenaireId("10", rs.getString("datasource")); - final List collectedFrom = MigrationUtils.listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")); + final String orgId = createOpenaireId("20", rs.getString("organization")); + final String dsId = createOpenaireId("10", rs.getString("datasource")); + final List collectedFrom = listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")); final Relation r1 = new Relation(); r1.setRelType("datasourceOrganization"); @@ -391,9 +390,9 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp public void processProjectOrganization(final ResultSet rs) { try { final DataInfo info = prepareDataInfo(rs); - final String orgId = MigrationUtils.createOpenaireId("20", 
rs.getString("resporganization")); - final String projectId = MigrationUtils.createOpenaireId("40", rs.getString("project")); - final List collectedFrom = MigrationUtils.listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")); + final String orgId = createOpenaireId("20", rs.getString("resporganization")); + final String projectId = createOpenaireId("40", rs.getString("project")); + final List collectedFrom = listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")); final Relation r1 = new Relation(); r1.setRelType("projectOrganization"); @@ -443,18 +442,18 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp final String inferenceprovenance = rs.getString("inferenceprovenance"); final Boolean inferred = rs.getBoolean("inferred"); final String trust = rs.getString("trust"); - return MigrationUtils.dataInfo(deletedbyinference, inferenceprovenance, inferred, false, ENTITYREGISTRY_PROVENANCE_ACTION, trust); + return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, ENTITYREGISTRY_PROVENANCE_ACTION, trust); } private Qualifier prepareQualifierSplitting(final String s) { if (StringUtils.isBlank(s)) { return null; } final String[] arr = s.split("@@@"); - return arr.length == 4 ? MigrationUtils.qualifier(arr[0], arr[1], arr[2], arr[3]) : null; + return arr.length == 4 ? 
qualifier(arr[0], arr[1], arr[2], arr[3]) : null; } - public static List> prepareListFields(final Array array, final DataInfo info) { + private static List> prepareListFields(final Array array, final DataInfo info) { try { - return MigrationUtils.listFields(info, (String[]) array.getArray()); + return listFields(info, (String[]) array.getArray()); } catch (final SQLException e) { throw new RuntimeException("Invalid SQL array", e); } @@ -466,7 +465,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp if (parts.length == 2) { final String value = parts[0]; final String[] arr = parts[1].split("@@@"); - if (arr.length == 4) { return MigrationUtils.structuredProperty(value, arr[0], arr[1], arr[2], arr[3], dataInfo); } + if (arr.length == 4) { return structuredProperty(value, arr[0], arr[1], arr[2], arr[3], dataInfo); } } return null; } @@ -492,8 +491,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0] : null; final String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1] : null;; final String lissn = StringUtils.isNotBlank(arr[2]) ? 
arr[2] : null;; - if (issn != null || eissn != null || lissn != null) { return MigrationUtils - .journal(name, issn, eissn, eissn, null, null, null, null, null, null, null, info); } + if (issn != null || eissn != null + || lissn != null) { return journal(name, issn, eissn, eissn, null, null, null, null, null, null, null, info); } } } return null; From 799929c1e3e40534f89be315d1884eb135e63516 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 21 Jan 2020 16:35:44 +0100 Subject: [PATCH 11/45] joining entities using T x R x S method with groupByKey --- dhp-schemas/pom.xml | 2 +- .../eu/dnetlib/dhp/schema/dli/Entity.java | 118 ----------- .../java/eu/dnetlib/dhp/schema/dli/Pid.java | 33 ---- .../eu/dnetlib/dhp/schema/dli/Provenance.java | 35 ---- .../eu/dnetlib/dhp/schema/dli/Relation.java | 47 ----- .../dhp/schema/dli/RelationSemantic.java | 16 -- .../eu/dnetlib/dhp/schema/dli/Subject.java | 35 ---- .../job-override.properties | 1 - dhp-workflows/dhp-graph-provision/pom.xml | 4 + .../java/eu/dnetlib/dhp/graph/EntityNode.java | 4 - .../eu/dnetlib/dhp/graph/EntityRelEntity.java | 32 +-- .../eu/dnetlib/dhp/graph/GraphJoiner.java | 186 ++++++++---------- .../eu/dnetlib/dhp/graph/LinkedEntity.java | 29 +++ .../eu/dnetlib/dhp/graph/RelatedEntity.java | 69 ------- .../dhp/graph/SparkGraphIndexingJob.java | 17 +- .../main/java/eu/dnetlib/dhp/graph/Tuple.java | 31 +++ .../java/eu/dnetlib/dhp/graph/TypedRow.java | 52 +++-- .../dhp/graph/oozie_app/config-default.xml | 8 + .../dnetlib/dhp/graph/oozie_app/workflow.xml | 19 +- 19 files changed, 238 insertions(+), 500 deletions(-) delete mode 100644 dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Entity.java delete mode 100644 dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Pid.java delete mode 100644 dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Provenance.java delete mode 100644 dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Relation.java delete mode 100644 
dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/RelationSemantic.java delete mode 100644 dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Subject.java delete mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/EntityNode.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/LinkedEntity.java delete mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/RelatedEntity.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/Tuple.java diff --git a/dhp-schemas/pom.xml b/dhp-schemas/pom.xml index ec5af8d3c..491cbe668 100644 --- a/dhp-schemas/pom.xml +++ b/dhp-schemas/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib.dhp dhp - 1.1.5-SNAPSHOT + 1.0.5-SNAPSHOT ../ diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Entity.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Entity.java deleted file mode 100644 index 894d54eaf..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Entity.java +++ /dev/null @@ -1,118 +0,0 @@ -package eu.dnetlib.dhp.schema.dli; - -import java.io.Serializable; -import java.util.List; - -public class Entity implements Serializable { - - private String identifier; - - private List pid; - - private List title; - - private List date; - - private String typology; - - private List authors; - - private List subject; - - private String description; - - private String completionStatus; - - private List collectedFrom; - - private List publisher; - - - public String getIdentifier() { - return identifier; - } - - public void setIdentifier(String identifier) { - this.identifier = identifier; - } - - public List getPid() { - return pid; - } - - public void setPid(List pid) { - this.pid = pid; - } - - public List getTitle() { - return title; - } - - public void setTitle(List title) { - this.title = title; - } - - public List getDate() { - return date; - } - - public void setDate(List date) { - this.date = 
date; - } - - public String getTypology() { - return typology; - } - - public void setTypology(String typology) { - this.typology = typology; - } - - public List getAuthors() { - return authors; - } - - public void setAuthors(List authors) { - this.authors = authors; - } - - public List getSubject() { - return subject; - } - - public void setSubject(List subject) { - this.subject = subject; - } - - public String getDescription() { - return description; - } - - public void setDescription(String description) { - this.description = description; - } - - public List getCollectedFrom() { - return collectedFrom; - } - - public void setCollectedFrom(List collectedFrom) { - this.collectedFrom = collectedFrom; - } - - public List getPublisher() { - return publisher; - } - - public void setPublisher(List publisher) { - this.publisher = publisher; - } - - public String getCompletionStatus() { - return completionStatus; - } - - public void setCompletionStatus(String completionStatus) { - this.completionStatus = completionStatus; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Pid.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Pid.java deleted file mode 100644 index 252245f45..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Pid.java +++ /dev/null @@ -1,33 +0,0 @@ -package eu.dnetlib.dhp.schema.dli; - -import eu.dnetlib.dhp.utils.DHPUtils; -import org.apache.commons.lang3.StringUtils; - -public class Pid { - - private String pid; - - private String pidType; - - public String getPid() { - return pid; - } - - public void setPid(String pid) { - this.pid = pid; - } - - public String getPidType() { - return pidType; - } - - public void setPidType(String pidType) { - this.pidType = pidType; - } - - public String generateId() { - if(StringUtils.isEmpty(pid) || StringUtils.isEmpty(pidType)) - return null; - return DHPUtils.md5(String.format("%s::%s", pid, pidType)); - } -} diff --git 
a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Provenance.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Provenance.java deleted file mode 100644 index 300b1134b..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Provenance.java +++ /dev/null @@ -1,35 +0,0 @@ -package eu.dnetlib.dhp.schema.dli; - -public class Provenance { - - private String datasourceId; - - private String datasourceName; - - private String completionStatus; - - - public String getDatasourceId() { - return datasourceId; - } - - public void setDatasourceId(String datasourceId) { - this.datasourceId = datasourceId; - } - - public String getDatasourceName() { - return datasourceName; - } - - public void setDatasourceName(String datasourceName) { - this.datasourceName = datasourceName; - } - - public String getCompletionStatus() { - return completionStatus; - } - - public void setCompletionStatus(String completionStatus) { - this.completionStatus = completionStatus; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Relation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Relation.java deleted file mode 100644 index b83cccb73..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Relation.java +++ /dev/null @@ -1,47 +0,0 @@ -package eu.dnetlib.dhp.schema.dli; - -import java.io.Serializable; -import java.util.List; - -public class Relation implements Serializable { - - private String source; - - private String target; - - private List provenance; - - private RelationSemantic semantic; - - public String getSource() { - return source; - } - - public void setSource(String source) { - this.source = source; - } - - public String getTarget() { - return target; - } - - public void setTarget(String target) { - this.target = target; - } - - public List getProvenance() { - return provenance; - } - - public void setProvenance(List provenance) { - this.provenance = provenance; - } - - public RelationSemantic getSemantic() { - return 
semantic; - } - - public void setSemantic(RelationSemantic semantic) { - this.semantic = semantic; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/RelationSemantic.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/RelationSemantic.java deleted file mode 100644 index ff871ef2d..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/RelationSemantic.java +++ /dev/null @@ -1,16 +0,0 @@ -package eu.dnetlib.dhp.schema.dli; - -import java.io.Serializable; - -public class RelationSemantic extends Subject implements Serializable { - - public String inverse; - - public String getInverse() { - return inverse; - } - - public void setInverse(String inverse) { - this.inverse = inverse; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Subject.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Subject.java deleted file mode 100644 index bd89bc6dd..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dli/Subject.java +++ /dev/null @@ -1,35 +0,0 @@ -package eu.dnetlib.dhp.schema.dli; - -import java.io.Serializable; - -public class Subject implements Serializable { - - private String schema; - - private String value; - - public Subject() { - - } - - public Subject(String schema, String value) { - this.schema = schema; - this.value = value; - } - - public String getSchema() { - return schema; - } - - public void setSchema(String schema) { - this.schema = schema; - } - - public String getValue() { - return value; - } - - public void setValue(String value) { - this.value = value; - } -} diff --git a/dhp-workflows/dhp-graph-provision/job-override.properties b/dhp-workflows/dhp-graph-provision/job-override.properties index 882053c1a..1870b0e6e 100644 --- a/dhp-workflows/dhp-graph-provision/job-override.properties +++ b/dhp-workflows/dhp-graph-provision/job-override.properties @@ -1,5 +1,4 @@ sparkDriverMemory=7G sparkExecutorMemory=7G -sparkExecutorMemoryOverhead=5G hive_db_name=claudio 
sourcePath=/tmp/db_openaireplus_services_beta.export.2019.11.06 \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/pom.xml b/dhp-workflows/dhp-graph-provision/pom.xml index d47463774..62d8ac2ae 100644 --- a/dhp-workflows/dhp-graph-provision/pom.xml +++ b/dhp-workflows/dhp-graph-provision/pom.xml @@ -19,6 +19,10 @@ org.apache.spark spark-sql_2.11 + + com.jayway.jsonpath + json-path + eu.dnetlib.dhp diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/EntityNode.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/EntityNode.java deleted file mode 100644 index be1babae2..000000000 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/EntityNode.java +++ /dev/null @@ -1,4 +0,0 @@ -package eu.dnetlib.dhp.graph; - -public class EntityNode { -} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/EntityRelEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/EntityRelEntity.java index ac89e4351..e8ecc2e30 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/EntityRelEntity.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/EntityRelEntity.java @@ -1,20 +1,30 @@ package eu.dnetlib.dhp.graph; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.schema.oaf.Relation; - import java.io.Serializable; public class EntityRelEntity implements Serializable { private TypedRow source; - private Relation relation; + private TypedRow relation; private TypedRow target; + public EntityRelEntity() { + } + public EntityRelEntity(TypedRow source) { this.source = source; } + + //helpers + public Boolean hasMainEntity() { + return getSource() != null & getRelation() == null & getTarget() == null; + } + + public Boolean hasRelatedEntity() { + return getSource() == null & getRelation() != null & 
getTarget() != null; + } + + public TypedRow getSource() { return source; } @@ -24,11 +34,11 @@ public class EntityRelEntity implements Serializable { return this; } - public Relation getRelation() { + public TypedRow getRelation() { return relation; } - public EntityRelEntity setRelation(Relation relation) { + public EntityRelEntity setRelation(TypedRow relation) { this.relation = relation; return this; } @@ -42,12 +52,4 @@ public class EntityRelEntity implements Serializable { return this; } - @Override - public String toString() { - try { - return new ObjectMapper().writeValueAsString(this); - } catch (JsonProcessingException e) { - throw new RuntimeException(e); - } - } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java index 5764642dc..aca436f52 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java @@ -1,139 +1,119 @@ package eu.dnetlib.dhp.graph; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.schema.oaf.*; +import com.google.common.collect.Iterables; +import com.google.common.collect.Lists; +import com.jayway.jsonpath.JsonPath; import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.Optional; import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; import scala.Tuple2; import java.io.Serializable; +import java.util.List; public class GraphJoiner implements Serializable { - public static final int MAX_RELS = 100; + public static final int MAX_RELS = 
10; public void join(final SparkSession spark, final String inputPath, final String hiveDbName, final String outPath) { final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - /* - JavaPairRDD entities = sc.sequenceFile(inputPath + "/publication", Text.class, Text.class) - .map(item -> new ObjectMapper().readValue(item._2().toString(), Publication.class)) - .map(oaf -> new TypedRow("publication", oaf)) - .mapToPair(toPair()); + final String entityIdPath = "$.id"; - */ + JavaPairRDD datasource = readPathEntity(sc, entityIdPath, inputPath, "datasource"); + JavaPairRDD organization = readPathEntity(sc, entityIdPath, inputPath, "organization"); + JavaPairRDD project = readPathEntity(sc, entityIdPath, inputPath, "project"); + JavaPairRDD dataset = readPathEntity(sc, entityIdPath, inputPath, "dataset"); + JavaPairRDD otherresearchproduct = readPathEntity(sc, entityIdPath, inputPath, "otherresearchproduct"); + JavaPairRDD software = readPathEntity(sc, entityIdPath, inputPath, "software"); + JavaPairRDD publication = readPathEntity(sc, entityIdPath, inputPath, "publication"); - JavaPairRDD entities = sc.sequenceFile(inputPath + "/datasource", Text.class, Text.class) - .map(item -> new ObjectMapper().readValue(item._2().toString(), Datasource.class)) - .map(oaf -> new TypedRow("datasource", oaf)) - .mapToPair(toPair()) - .union(sc.sequenceFile(inputPath + "/organization", Text.class, Text.class) - .map(item -> new ObjectMapper().readValue(item._2().toString(), Organization.class)) - .map(oaf -> new TypedRow("organization", oaf)) - .mapToPair(toPair())) - .union(sc.sequenceFile(inputPath + "/project", Text.class, Text.class) - .map(item -> new ObjectMapper().readValue(item._2().toString(), Project.class)) - .map(oaf -> new TypedRow("project", oaf)) - .mapToPair(toPair())) - .union(sc.sequenceFile(inputPath + "/dataset", Text.class, Text.class) - .map(item -> new ObjectMapper().readValue(item._2().toString(), Dataset.class)) - .map(oaf -> new 
TypedRow("dataset", oaf)) - .mapToPair(toPair())) - .union(sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class) - .map(item -> new ObjectMapper().readValue(item._2().toString(), OtherResearchProduct.class)) - .map(oaf -> new TypedRow("otherresearchproduct", oaf)) - .mapToPair(toPair())) - .union(sc.sequenceFile(inputPath + "/software", Text.class, Text.class) - .map(item -> new ObjectMapper().readValue(item._2().toString(), Software.class)) - .map(oaf -> new TypedRow("software", oaf)) - .mapToPair(toPair())); - /* - .union(sc.sequenceFile(inputPath + "/publication", Text.class, Text.class) - .map(item -> new ObjectMapper().readValue(item._2().toString(), Publication.class)) - .map(oaf -> new TypedRow("publication", oaf)) - .mapToPair(toPair())); + final String entitiesPath = outPath + "/entities"; + datasource + .union(organization) + .union(project) + .union(dataset) + .union(otherresearchproduct) + .union(software) + .union(publication) + .map(e -> new EntityRelEntity().setSource(e._2())) + .map(e -> new ObjectMapper().writeValueAsString(e)) + .saveAsTextFile(entitiesPath, GzipCodec.class); - */ + JavaPairRDD entities = sc.textFile(entitiesPath) + .map(t -> new ObjectMapper().readValue(t, EntityRelEntity.class)) + .mapToPair(t -> new Tuple2<>(t.getSource().getSource(), t)); - /* - JavaRDD rels = sc.sequenceFile(inputPath + "/relation", Text.class, Text.class) - .map(item -> new ObjectMapper().readValue(item._2().toString(), Relation.class)) - .map(oaf -> new TypedRow("relation", oaf)) - .mapToPair(toPair()) + final JavaPairRDD relation = readPathRelation(sc, inputPath) + .map(p -> new EntityRelEntity().setRelation(p)) + .mapToPair(p -> new Tuple2<>(p.getRelation().getSource(), p)) .groupByKey() - .map(t -> Iterables.limit(t._2(), MAX_RELS)) - .flatMap(t -> t.iterator()) - .map(t -> (Relation) t.getOaf()); + .map(p -> Iterables.limit(p._2(), MAX_RELS)) + .flatMap(p -> p.iterator()) + .mapToPair(p -> new Tuple2<>(p.getRelation().getTarget(), 
p)); - spark.createDataset(rels.rdd(), Encoders.bean(Relation.class)) - .write() - .mode(SaveMode.Overwrite) - .saveAsTable(hiveDbName + ".relation_100"); - */ + final String joinByTargetPath = outPath + "/join_by_target"; + relation.join(entities) + .map(s -> new EntityRelEntity() + .setRelation(s._2()._1().getRelation()) + .setTarget(s._2()._2().getSource())) + .map(e -> new ObjectMapper().writeValueAsString(e)) + .saveAsTextFile(joinByTargetPath, GzipCodec.class); - JavaPairRDD bounded_rels = spark.table(hiveDbName + ".relation_" + MAX_RELS) - .as(Encoders.bean(Relation.class)) - .javaRDD() - .map(r -> new TypedRow("relation", r)) - .mapToPair(toPair()); - // build the adjacency list: e -> r - JavaPairRDD>> adjacency_list = entities.leftOuterJoin(bounded_rels); + JavaPairRDD bySource = sc.textFile(joinByTargetPath) + .map(t -> new ObjectMapper().readValue(t, EntityRelEntity.class)) + .mapToPair(t -> new Tuple2<>(t.getRelation().getSource(), t)); - JavaRDD linked_entities = adjacency_list - .mapToPair(toPairTarget()) // make rel.targetid explicit so that we can join it - .leftOuterJoin(entities) // again with the entities to get the target entity - .map(l -> toEntityRelEntity(l)); // and map it to a more readable representation - - spark.createDataFrame(linked_entities, EntityRelEntity.class) - .write() - .mode(SaveMode.Overwrite) - .saveAsTable(hiveDbName + ".linked_entities"); + entities + .union(bySource) + .groupByKey() // by source id + .map(p -> { + final LinkedEntity e = new LinkedEntity(); + final List links = Lists.newArrayList(); + for(EntityRelEntity rel : p._2()) { + if (rel.hasMainEntity() & e.getEntity() == null) { + e.setEntity(rel.getSource()); + } + if (rel.hasRelatedEntity()) { + links.add(new Tuple() + .setRelation(rel.getRelation()) + .setTarget(rel.getTarget())); + } + } + e.setLinks(links); + if (e.getEntity() == null) { + throw new IllegalStateException("missing main entity on '" + p._1() + "'"); + } + return e; + }) + .map(e -> new 
ObjectMapper().writeValueAsString(e)) + .saveAsTextFile(outPath + "/linked_entities", GzipCodec.class); } - private EntityRelEntity toEntityRelEntity(Tuple2>>, Optional>> l) { - // extract the entity source - final EntityRelEntity res = new EntityRelEntity(l._2()._1()._2()._1()); - - if(l._2()._1()._2()._2().isPresent() && l._2()._2().isPresent()) { - - // extract the relationship - res.setRelation((Relation) l._2()._1()._2()._2().get().getOaf()); - - // extract the related entity - res.setTarget(l._2()._2().get()); - } - - return res; + private JavaPairRDD readPathEntity(final JavaSparkContext sc, final String idPath, final String inputPath, final String type) { + return sc.sequenceFile(inputPath + "/" + type, Text.class, Text.class) + .mapToPair((PairFunction, String, TypedRow>) item -> { + final String json = item._2().toString(); + final String id = JsonPath.read(json, idPath); + return new Tuple2<>(id, new TypedRow(id, type, json)); + }); } - private PairFunction>>, String, Tuple2>>> toPairTarget() { - return e -> { - Optional o = e._2()._2(); - if (o.isPresent()) { - return new Tuple2<>(((Relation) o.get().getOaf()).getTarget(), e); - } else { - return new Tuple2<>(null, e); - } - }; - } - - private PairFunction toPair() { - return e -> { - if (!"relation".equals(e.getType())) { - return new Tuple2<>( ((OafEntity) e.getOaf()).getId(), e); - } else { - return new Tuple2<>( ((Relation) e.getOaf()).getSource(), e); - } - }; + private JavaRDD readPathRelation(final JavaSparkContext sc, final String inputPath) { + return sc.sequenceFile(inputPath + "/relation", Text.class, Text.class) + .map(item -> { + final String json = item._2().toString(); + final String source = JsonPath.read(json, "$.source"); + final String target = JsonPath.read(json, "$.target"); + return new TypedRow(source, target, "relation", json); + }); } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/LinkedEntity.java 
b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/LinkedEntity.java new file mode 100644 index 000000000..9e6fc0d38 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/LinkedEntity.java @@ -0,0 +1,29 @@ +package eu.dnetlib.dhp.graph; + +import java.io.Serializable; +import java.util.List; + +public class LinkedEntity implements Serializable { + + private TypedRow entity; + + private List links; + + public TypedRow getEntity() { + return entity; + } + + public LinkedEntity setEntity(TypedRow entity) { + this.entity = entity; + return this; + } + + public List getLinks() { + return links; + } + + public LinkedEntity setLinks(List links) { + this.links = links; + return this; + } +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/RelatedEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/RelatedEntity.java deleted file mode 100644 index dbab04f16..000000000 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/RelatedEntity.java +++ /dev/null @@ -1,69 +0,0 @@ -package eu.dnetlib.dhp.graph; - -import java.io.Serializable; - -public class RelatedEntity implements Serializable { - - private String relType; - - private String subRelType; - - private String relClass; - - private String type; - - private String payload; - - public RelatedEntity(String relType, String subRelType, String relClass, String type, String payload) { - this.relType = relType; - this.subRelType = subRelType; - this.relClass = relClass; - this.type = type; - this.payload = payload; - } - - public String getRelType() { - return relType; - } - - public RelatedEntity setRelType(String relType) { - this.relType = relType; - return this; - } - - public String getSubRelType() { - return subRelType; - } - - public RelatedEntity setSubRelType(String subRelType) { - this.subRelType = subRelType; - return this; - } - - public String getRelClass() { - return relClass; - 
} - - public RelatedEntity setRelClass(String relClass) { - this.relClass = relClass; - return this; - } - - public String getType() { - return type; - } - - public RelatedEntity setType(String type) { - this.type = type; - return this; - } - - public String getPayload() { - return payload; - } - - public RelatedEntity setPayload(String payload) { - this.payload = payload; - return this; - } -} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkGraphIndexingJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkGraphIndexingJob.java index ce8e7e396..1d55dda89 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkGraphIndexingJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkGraphIndexingJob.java @@ -4,21 +4,27 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser; import org.apache.commons.io.IOUtils; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.spark.SparkConf; import org.apache.spark.sql.SparkSession; public class SparkGraphIndexingJob { - private final static String ENTITY_NODES_PATH = "/tmp/entity_node"; + private final static String OUTPUT_BASE_PATH = "/tmp/openaire_provision"; public static void main(String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkGraphIndexingJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_graph_parameters.json"))); parser.parseArgument(args); + + final SparkConf conf = new SparkConf() + .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") + .set("hive.metastore.uris", parser.get("hive_metastore_uris")); + final SparkSession spark = SparkSession .builder() + .config(conf) .appName(SparkGraphIndexingJob.class.getSimpleName()) .master(parser.get("master")) - .config("hive.metastore.uris", parser.get("hive_metastore_uris")) .enableHiveSupport() 
.getOrCreate(); @@ -26,11 +32,12 @@ public class SparkGraphIndexingJob { final String hiveDbName = parser.get("hive_db_name"); final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration()); - if (fs.exists(new Path(ENTITY_NODES_PATH))) { - fs.delete(new Path(ENTITY_NODES_PATH), true); + if (fs.exists(new Path(OUTPUT_BASE_PATH))) { + fs.delete(new Path(OUTPUT_BASE_PATH), true); + fs.mkdirs(new Path(OUTPUT_BASE_PATH)); } - new GraphJoiner().join(spark, inputPath, hiveDbName, ENTITY_NODES_PATH); + new GraphJoiner().join(spark, inputPath, hiveDbName, OUTPUT_BASE_PATH); } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/Tuple.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/Tuple.java new file mode 100644 index 000000000..0b22a63a5 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/Tuple.java @@ -0,0 +1,31 @@ +package eu.dnetlib.dhp.graph; + +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.Relation; + +import java.io.Serializable; + +public class Tuple implements Serializable { + + private TypedRow relation; + + private TypedRow target; + + public TypedRow getRelation() { + return relation; + } + + public Tuple setRelation(TypedRow relation) { + this.relation = relation; + return this; + } + + public TypedRow getTarget() { + return target; + } + + public Tuple setTarget(TypedRow target) { + this.target = target; + return this; + } +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/TypedRow.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/TypedRow.java index 5c933ca80..60c3b64b2 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/TypedRow.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/TypedRow.java @@ -1,20 +1,46 @@ package eu.dnetlib.dhp.graph; -import com.fasterxml.jackson.core.JsonProcessingException; 
-import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.schema.oaf.Oaf; - import java.io.Serializable; public class TypedRow implements Serializable { - private String type; - private Oaf oaf; - public TypedRow(String type, Oaf oaf) { + private String source; + private String target; + private String type; + private String oaf; + + public TypedRow() { + } + + public TypedRow(String source, String type, String oaf) { + this.source = source; this.type = type; this.oaf = oaf; } + public TypedRow(String source, String target, String type, String oaf) { + this(source, type, oaf); + this.target = target; + } + + public String getSource() { + return source; + } + + public TypedRow setSource(String source) { + this.source = source; + return this; + } + + public String getTarget() { + return target; + } + + public TypedRow setTarget(String target) { + this.target = target; + return this; + } + public String getType() { return type; } @@ -24,21 +50,13 @@ public class TypedRow implements Serializable { return this; } - public Oaf getOaf() { + public String getOaf() { return oaf; } - public TypedRow setOaf(Oaf oaf) { + public TypedRow setOaf(String oaf) { this.oaf = oaf; return this; } - @Override - public String toString() { - try { - return new ObjectMapper().writeValueAsString(this); - } catch (JsonProcessingException e) { - throw new RuntimeException(e); - } - } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/config-default.xml index fcab9dd00..624d3ea76 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/config-default.xml @@ -23,4 +23,12 @@ hive_db_name openaire + + spark2YarnHistoryServerAddress + http://iis-cdh5-test-gw.ocean.icm.edu.pl:18088 + 
+ + spark2EventLogDir + /user/spark/applicationHistory + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml index 00a890268..a91759ade 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml @@ -16,6 +16,14 @@ sparkExecutorCores number of cores used by single executor + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + @@ -33,7 +41,16 @@ GraphIndexing eu.dnetlib.dhp.graph.SparkGraphIndexingJob dhp-graph-provision-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" --conf spark.yarn.executor.memoryOverhead=${sparkExecutorMemoryOverhead} + + --executor-memory ${sparkExecutorMemory} + --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" + --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + --conf spark.sql.warehouse.dir="/user/hive/warehouse" + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + -mt yarn-cluster --sourcePath${sourcePath} --hive_db_name${hive_db_name} From 6bfe2dc96e0bb9a36008d3157dde36b0269060c9 Mon Sep 17 00:00:00 2001 From: Michele Artini Date: Wed, 22 Jan 2020 16:00:23 +0100 Subject: [PATCH 
12/45] partial implementation --- .../migration/AbstractMigrateApplication.java | 8 +- .../MigrateDbEntitiesApplication.java | 29 +- .../MigrateMongoMdstoresApplication.java | 265 +++++++++++++++--- .../migrate_mongo_mstores_parameters.json | 18 ++ 4 files changed, 258 insertions(+), 62 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrateApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrateApplication.java index b8f92fb9c..73ee7f822 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrateApplication.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrateApplication.java @@ -116,9 +116,13 @@ public class AbstractMigrateApplication implements Closeable { final String schemeid, final String schemename, final DataInfo dataInfo) { + return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo); + } + + public static StructuredProperty structuredProperty(final String value, final Qualifier qualifier, final DataInfo dataInfo) { final StructuredProperty sp = new StructuredProperty(); sp.setValue(value); - sp.setQualifier(qualifier(classid, classname, schemeid, schemename)); + sp.setQualifier(qualifier); sp.setDataInfo(dataInfo); return sp; } @@ -198,7 +202,7 @@ public class AbstractMigrateApplication implements Closeable { return d; } - public static String createOpenaireId(final String prefix, final String originalId) { + public static String createOpenaireId(final int prefix, final String originalId) { final String nsPrefix = StringUtils.substringBefore(originalId, "::"); final String rest = StringUtils.substringAfter(originalId, "::"); return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest)); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java index deb7fdd69..0b47c5282 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java @@ -60,7 +60,6 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp smdbe.execute("queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization); smdbe.execute("queryProjectOrganization.sql", smdbe::processProjectOrganization); } - } public MigrateDbEntitiesApplication(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String dbUrl, final String dbUser, @@ -82,13 +81,13 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp final Datasource ds = new Datasource(); - ds.setId(createOpenaireId("10", rs.getString("datasourceid"))); + ds.setId(createOpenaireId(10, rs.getString("datasourceid"))); ds.setOriginalId(Arrays.asList(rs.getString("datasourceid"))); ds.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); - ds.setPid(null); // List // TODO + ds.setPid(new ArrayList<>()); ds.setDateofcollection(rs.getDate("dateofcollection").toString()); ds.setDateoftransformation(null); // Value not returned by the SQL query - ds.setExtraInfo(null); // TODO + ds.setExtraInfo(new ArrayList<>()); // Values not present in the DB ds.setOaiprovenance(null); // Values not present in the DB ds.setDatasourcetype(prepareQualifierSplitting(rs.getString("datasourcetype"))); ds.setOpenairecompatibility(prepareQualifierSplitting(rs.getString("openairecompatibility"))); @@ -189,13 +188,13 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp final Project p = new Project(); - p.setId(createOpenaireId("40", rs.getString("projectid"))); + p.setId(createOpenaireId(40, 
rs.getString("projectid"))); p.setOriginalId(Arrays.asList(rs.getString("projectid"))); p.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); - p.setPid(null); // List // TODO + p.setPid(new ArrayList<>()); p.setDateofcollection(rs.getDate("dateofcollection").toString()); p.setDateoftransformation(rs.getDate("dateoftransformation").toString()); - p.setExtraInfo(null); // List //TODO + p.setExtraInfo(new ArrayList<>()); // Values not present in the DB p.setOaiprovenance(null); // Values not present in the DB p.setWebsiteurl(field(rs.getString("websiteurl"), info)); p.setCode(field(rs.getString("code"), info)); @@ -278,13 +277,13 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp final Organization o = new Organization(); - o.setId(createOpenaireId("20", rs.getString("organizationid"))); // String id) { + o.setId(createOpenaireId(20, rs.getString("organizationid"))); o.setOriginalId(Arrays.asList(rs.getString("organizationid"))); o.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); - o.setPid(null); // List // TODO + o.setPid(new ArrayList<>()); o.setDateofcollection(rs.getDate("dateofcollection").toString()); o.setDateoftransformation(rs.getDate("dateoftransformation").toString()); - o.setExtraInfo(null); // List // TODO + o.setExtraInfo(new ArrayList<>()); // Values not present in the DB o.setOaiprovenance(null); // Values not present in the DB o.setLegalshortname(field("legalshortname", info)); o.setLegalname(field("legalname", info)); @@ -342,8 +341,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp try { final DataInfo info = prepareDataInfo(rs); - final String orgId = createOpenaireId("20", rs.getString("organization")); - final String dsId = createOpenaireId("10", rs.getString("datasource")); + final String orgId = createOpenaireId(20, rs.getString("organization")); + final String dsId = 
createOpenaireId(10, rs.getString("datasource")); final List collectedFrom = listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")); final Relation r1 = new Relation(); @@ -390,8 +389,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp public void processProjectOrganization(final ResultSet rs) { try { final DataInfo info = prepareDataInfo(rs); - final String orgId = createOpenaireId("20", rs.getString("resporganization")); - final String projectId = createOpenaireId("40", rs.getString("project")); + final String orgId = createOpenaireId(20, rs.getString("resporganization")); + final String projectId = createOpenaireId(40, rs.getString("project")); final List collectedFrom = listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")); final Relation r1 = new Relation(); @@ -451,7 +450,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrateApplication imp return arr.length == 4 ? qualifier(arr[0], arr[1], arr[2], arr[3]) : null; } - private static List> prepareListFields(final Array array, final DataInfo info) { + private List> prepareListFields(final Array array, final DataInfo info) { try { return listFields(info, (String[]) array.getArray()); } catch (final SQLException e) { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateMongoMdstoresApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateMongoMdstoresApplication.java index cead2366b..f6dcaf0e8 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateMongoMdstoresApplication.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateMongoMdstoresApplication.java @@ -2,34 +2,56 @@ package eu.dnetlib.dhp.migration; import java.io.Closeable; import java.io.IOException; -import java.io.StringReader; +import java.sql.SQLException; import java.util.ArrayList; +import java.util.Arrays; 
+import java.util.Date; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Map.Entry; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.dom4j.Document; import org.dom4j.DocumentException; +import org.dom4j.DocumentFactory; +import org.dom4j.DocumentHelper; import org.dom4j.Node; -import org.dom4j.io.SAXReader; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Field; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.OAIProvenance; import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Software; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; public class MigrateMongoMdstoresApplication extends AbstractMigrateApplication implements Closeable { private static final Log log = LogFactory.getLog(MigrateMongoMdstoresApplication.class); + private final Map code2name = new HashMap<>(); + private final MdstoreClient mdstoreClient; + private static final Qualifier MAIN_TITLE_QUALIFIER = qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); + + private static final Qualifier PUBLICATION_RESULTTYPE_QUALIFIER = + qualifier("publication", "publication", "dnet:result_typologies", "dnet:result_typologies"); + private static final Qualifier DATASET_RESULTTYPE_QUALIFIER = qualifier("dataset", "dataset", "dnet:result_typologies", "dnet:result_typologies"); + private static final Qualifier SOFTWARE_RESULTTYPE_QUALIFIER = qualifier("software", "software", "dnet:result_typologies", 
"dnet:result_typologies"); + private static final Qualifier OTHER_RESULTTYPE_QUALIFIER = qualifier("other", "other", "dnet:result_typologies", "dnet:result_typologies"); + public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils.toString(MigrateMongoMdstoresApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json"))); @@ -46,16 +68,46 @@ public class MigrateMongoMdstoresApplication extends AbstractMigrateApplication final String hdfsNameNode = parser.get("namenode"); final String hdfsUser = parser.get("hdfsUser"); - try (final MigrateMongoMdstoresApplication mig = new MigrateMongoMdstoresApplication(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb)) { + final String dbUrl = parser.get("postgresUrl"); + final String dbUser = parser.get("postgresUser"); + final String dbPassword = parser.get("postgresPassword"); + + try (final MigrateMongoMdstoresApplication mig = + new MigrateMongoMdstoresApplication(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword)) { mig.processMdRecords(mdFormat, mdLayout, mdInterpretation); } } public MigrateMongoMdstoresApplication(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl, - final String mongoDb) throws Exception { + final String mongoDb, final String dbUrl, final String dbUser, + final String dbPassword) throws Exception { super(hdfsPath, hdfsNameNode, hdfsUser); + this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb); + loadClassNames(dbUrl, dbUser, dbPassword); + + final Map nsContext = new HashMap<>(); + nsContext.put("dc", "http://purl.org/dc/elements/1.1/"); + nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr"); + nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri"); + nsContext.put("oaf", "http://namespace.openaire.eu/oaf"); + nsContext.put("oai", 
"http://www.openarchives.org/OAI/2.0/"); + nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance"); + DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); + } + + private void loadClassNames(final String dbUrl, final String dbUser, final String dbPassword) throws IOException { + try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) { + code2name.clear(); + dbClient.processResults("select code, name from class", rs -> { + try { + code2name.put(rs.getString("code"), rs.getString("name")); + } catch (final SQLException e) { + e.printStackTrace(); + } + }); + } } @@ -74,23 +126,29 @@ public class MigrateMongoMdstoresApplication extends AbstractMigrateApplication } private List createOafs(final String xml) throws DocumentException { - final SAXReader reader = new SAXReader(); - final Document doc = reader.read(new StringReader(xml)); - final String type = doc.valueOf(""); // TODO + final Document doc = DocumentHelper.parseText(xml); + + final String type = doc.valueOf("//dr:CobjCategory/@type"); + final KeyValue collectedFrom = keyValue(doc.valueOf("//oaf:collectedFrom/@id"), doc.valueOf("//oaf:collectedFrom/@name")); + final DataInfo info = prepareDataInfo(doc); + final long lastUpdateTimestamp = new Date().getTime(); final List oafs = new ArrayList<>(); switch (type.toLowerCase()) { + case "": case "publication": final Publication p = new Publication(); - populateResultFields(p, doc); + populateResultFields(p, doc, collectedFrom, info, lastUpdateTimestamp); + p.setResulttype(PUBLICATION_RESULTTYPE_QUALIFIER); p.setJournal(null); // TODO oafs.add(p); break; case "dataset": final Dataset d = new Dataset(); - populateResultFields(d, doc); + populateResultFields(d, doc, collectedFrom, info, lastUpdateTimestamp); + d.setResulttype(DATASET_RESULTTYPE_QUALIFIER); d.setStoragedate(null); // TODO d.setDevice(null); // TODO d.setSize(null); // TODO @@ -101,16 +159,11 @@ public class MigrateMongoMdstoresApplication extends 
AbstractMigrateApplication oafs.add(d); break; case "otherresearchproducts": - final OtherResearchProduct o = new OtherResearchProduct(); - populateResultFields(o, doc); - o.setContactperson(null); // TODO - o.setContactgroup(null); // TODO - o.setTool(null); // TODO - oafs.add(o); - break; + case "software": final Software s = new Software(); - populateResultFields(s, doc); + populateResultFields(s, doc, collectedFrom, info, lastUpdateTimestamp); + s.setResulttype(SOFTWARE_RESULTTYPE_QUALIFIER); s.setDocumentationUrl(null); // TODO s.setLicense(null); // TODO s.setCodeRepositoryUrl(null); // TODO @@ -118,20 +171,32 @@ public class MigrateMongoMdstoresApplication extends AbstractMigrateApplication oafs.add(s); break; default: - log.error("Inavlid type: " + type); + final OtherResearchProduct o = new OtherResearchProduct(); + populateResultFields(o, doc, collectedFrom, info, lastUpdateTimestamp); + o.setResulttype(OTHER_RESULTTYPE_QUALIFIER); + o.setContactperson(null); // TODO + o.setContactgroup(null); // TODO + o.setTool(null); // TODO + oafs.add(o); break; } if (!oafs.isEmpty()) { - addRelations(oafs, doc, "//*", "TYPE"); - addRelations(oafs, doc, "//*", "TYPE"); - addRelations(oafs, doc, "//*", "TYPE"); + addRelations(oafs, doc, "//*", "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO + addRelations(oafs, doc, "//*", "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO + addRelations(oafs, doc, "//*", "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO } return oafs; } - private void addRelations(final List oafs, final Document doc, final String xpath, final String type) { + private void addRelations(final List oafs, + final Document doc, + final String xpath, + final String type, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp) { for (final Object o : doc.selectNodes(xpath)) { final Node n = (Node) o; final Relation r = new Relation(); @@ -140,40 +205,42 @@ public class 
MigrateMongoMdstoresApplication extends AbstractMigrateApplication r.setRelClass(null); // TODO r.setSource(null); // TODO r.setTarget(null); // TODO - r.setCollectedFrom(null); // TODO + r.setCollectedFrom(Arrays.asList(collectedFrom)); + r.setDataInfo(info); + r.setLastupdatetimestamp(lastUpdateTimestamp); oafs.add(r); } } - private void populateResultFields(final Result r, final Document doc) { - r.setDataInfo(null); // TODO - r.setLastupdatetimestamp(null); // TODO - r.setId(null); // TODO - r.setOriginalId(null); // TODO - r.setCollectedfrom(null); // TODO - r.setPid(null); // TODO - r.setDateofcollection(null); // TODO - r.setDateoftransformation(null); // TODO - r.setExtraInfo(null); // TODO - r.setOaiprovenance(null); // TODO + private void populateResultFields(final Result r, final Document doc, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) { + + r.setDataInfo(info); + r.setLastupdatetimestamp(lastUpdateTimestamp); + r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"))); + r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier"))); + r.setCollectedfrom(Arrays.asList(collectedFrom)); + r.setPid(prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); + r.setDateofcollection(doc.valueOf("//dr:dateOfCollection")); + r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation")); + r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r.setOaiprovenance(prepareOAIprovenance(doc)); r.setAuthor(null); // TODO - r.setResulttype(null); // TODO - r.setLanguage(null); // TODO - r.setCountry(null); // TODO - r.setSubject(null); // TODO - r.setTitle(null); // TODO + r.setLanguage(prepareQualifier(doc, "//dc:language", "dnet:languages", "dnet:languages")); + r.setCountry(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r.setSubject(prepareListStructProps(doc, "//dc:subject", info)); + r.setTitle(prepareListStructProps(doc, "//dc:title", 
MAIN_TITLE_QUALIFIER, info)); r.setRelevantdate(null); // TODO - r.setDescription(null); // TODO - r.setDateofacceptance(null); // TODO - r.setPublisher(null); // TODO + r.setDescription(prepareListFields(doc, "//dc:description", info)); + r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info)); + r.setPublisher(prepareField(doc, "//dc:publisher", info)); r.setEmbargoenddate(null); // TODO r.setSource(null); // TODO r.setFulltext(null); // TODO - r.setFormat(null); // TODO - r.setContributor(null); // TODO + r.setFormat(prepareListFields(doc, "//dc:format", info)); + r.setContributor(prepareListFields(doc, "//dc:contributor", info)); r.setResourcetype(null); // TODO - r.setCoverage(null); // TODO + r.setCoverage(prepareListFields(doc, "//dc:coverage", info)); r.setRefereed(null); // TODO r.setContext(null); // TODO r.setExternalReference(null); // TODO @@ -182,9 +249,117 @@ public class MigrateMongoMdstoresApplication extends AbstractMigrateApplication r.setProcessingchargecurrency(null); // TODO } + private Qualifier prepareQualifier(final Document doc, final String xpath, final String schemeId, final String schemeName) { + final String classId = doc.valueOf(xpath); + final String className = code2name.get(classId); + return qualifier(classId, className, schemeId, schemeName); + } + + private List prepareListStructProps(final Document doc, + final String xpath, + final String xpathClassId, + final String schemeId, + final String schemeName, + final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : doc.selectNodes(xpath)) { + final Node n = (Node) o; + final String classId = n.valueOf(xpathClassId); + final String className = code2name.get(classId); + res.add(structuredProperty(n.getText(), classId, className, schemeId, schemeName, info)); + } + return res; + } + + private List prepareListStructProps(final Document doc, final String xpath, final Qualifier qualifier, final DataInfo info) { + final List res = new 
ArrayList<>(); + for (final Object o : doc.selectNodes(xpath)) { + final Node n = (Node) o; + res.add(structuredProperty(n.getText(), qualifier, info)); + } + return res; + } + + private List prepareListStructProps(final Document doc, final String xpath, final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : doc.selectNodes(xpath)) { + final Node n = (Node) o; + res.add(structuredProperty(n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), n + .valueOf("@schemename"), info)); + } + return res; + } + + private OAIProvenance prepareOAIprovenance(final Document doc) { + final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']"); + + final String identifier = n.valueOf("./*[local-name()='identifier']"); + final String baseURL = n.valueOf("./*[local-name()='baseURL']");; + final String metadataNamespace = n.valueOf("./*[local-name()='metadataNamespace']");; + final boolean altered = n.valueOf("@altered").equalsIgnoreCase("true"); + final String datestamp = n.valueOf("./*[local-name()='datestamp']");; + final String harvestDate = n.valueOf("@harvestDate");; + + return oaiIProvenance(identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate); + } + + private DataInfo prepareDataInfo(final Document doc) { + final Node n = doc.selectSingleNode("//oaf:datainfo"); + + final String paClassId = n.valueOf("./oaf:provenanceaction/@classid"); + final String paClassName = n.valueOf("./oaf:provenanceaction/@classname"); + final String paSchemeId = n.valueOf("./oaf:provenanceaction/@schemeid"); + final String paSchemeName = n.valueOf("./oaf:provenanceaction/@schemename"); + + final boolean deletedbyinference = Boolean.parseBoolean(n.valueOf("./oaf:deletedbyinference")); + final String inferenceprovenance = n.valueOf("./oaf:inferenceprovenance"); + final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred")); + final String trust = 
n.valueOf("./oaf:trust"); + + return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust); + } + + private Field prepareField(final Document doc, final String xpath, final DataInfo info) { + return field(doc.valueOf(xpath), info); + } + + private List> prepareListFields(final Document doc, final String xpath, final DataInfo info) { + return listFields(info, (String[]) prepareListString(doc, xpath).toArray()); + } + + private List prepareListString(final Document doc, final String xpath) { + final List res = new ArrayList<>(); + for (final Object o : doc.selectNodes(xpath)) { + final String s = ((Node) o).getText().trim(); + if (StringUtils.isNotBlank(s)) { + res.add(s); + } + } + return res; + } + /* + * private StructuredProperty prepareStructProp(final Document doc, final String xpath, final DataInfo dataInfo) { if + * (StringUtils.isBlank(s)) { return null; } final String[] parts = s.split("###"); if (parts.length == 2) { final String value = + * parts[0]; final String[] arr = parts[1].split("@@@"); if (arr.length == 4) { return structuredProperty(value, arr[0], arr[1], arr[2], + * arr[3], dataInfo); } } return null; } + * + * private List prepareListOfStructProps(final Document doc, final String xpath, final DataInfo dataInfo) { final + * List res = new ArrayList<>(); if (array != null) { for (final String s : (String[]) array.getArray()) { final + * StructuredProperty sp = prepareStructProp(s, dataInfo); if (sp != null) { res.add(sp); } } } + * + * return res; } + * + * private Journal prepareJournal(final Document doc, final String xpath, final DataInfo info) { if (StringUtils.isNotBlank(sj)) { final + * String[] arr = sj.split("@@@"); if (arr.length == 3) { final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0] : null; final + * String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1] : null;; final String lissn = StringUtils.isNotBlank(arr[2]) ? 
arr[2] : null;; + * if (issn != null || eissn != null || lissn != null) { return journal(name, issn, eissn, eissn, null, null, null, null, null, null, + * null, info); } } } return null; } + */ + @Override public void close() throws IOException { super.close(); mdstoreClient.close(); } + } diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json index fb5736dc0..3cd6f39f5 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json @@ -46,5 +46,23 @@ "paramLongName": "mdInterpretation", "paramDescription": "metadata interpretation", "paramRequired": true + }, + { + "paramName": "postgresUrl", + "paramLongName": "postgresUrl", + "paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb", + "paramRequired": true + }, + { + "paramName": "postgresUser", + "paramLongName": "postgresUser", + "paramDescription": "postgres user", + "paramRequired": true + }, + { + "paramName": "postgresPassword", + "paramLongName": "postgresPassword", + "paramDescription": "postgres password", + "paramRequired": true } ] \ No newline at end of file From a55f5fecc63d46e7cc8aa03c7991b08e0cb2966f Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 24 Jan 2020 08:17:53 +0100 Subject: [PATCH 13/45] joining entities using T x R x S method with groupByKey, WIP: making target objects (T) have lower memory footprint --- .../eu/dnetlib/dhp/graph/EntityRelEntity.java | 1 - .../eu/dnetlib/dhp/graph/GraphJoiner.java | 53 +++-- .../eu/dnetlib/dhp/graph/MappingUtils.java | 103 +++++++++ .../eu/dnetlib/dhp/graph/RelatedEntity.java | 210 ++++++++++++++++++ .../main/java/eu/dnetlib/dhp/graph/Tuple.java | 4 +- 
.../java/eu/dnetlib/dhp/graph/TypedRow.java | 49 ++-- .../dnetlib/dhp/graph/MappingUtilsTest.java | 42 ++++ .../eu/dnetlib/dhp/graph/datasource.json | 1 + .../eu/dnetlib/dhp/graph/result.json | 1 + 9 files changed, 414 insertions(+), 50 deletions(-) create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/MappingUtils.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/RelatedEntity.java create mode 100644 dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/MappingUtilsTest.java create mode 100644 dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/datasource.json create mode 100644 dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/result.json diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/EntityRelEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/EntityRelEntity.java index e8ecc2e30..b0711bbff 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/EntityRelEntity.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/EntityRelEntity.java @@ -51,5 +51,4 @@ public class EntityRelEntity implements Serializable { this.target = target; return this; } - } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java index aca436f52..d8641f272 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java @@ -3,7 +3,9 @@ package eu.dnetlib.dhp.graph; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; +import com.jayway.jsonpath.DocumentContext; import com.jayway.jsonpath.JsonPath; +import 
eu.dnetlib.dhp.schema.oaf.Qualifier; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.api.java.JavaPairRDD; @@ -24,15 +26,13 @@ public class GraphJoiner implements Serializable { final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String entityIdPath = "$.id"; - - JavaPairRDD datasource = readPathEntity(sc, entityIdPath, inputPath, "datasource"); - JavaPairRDD organization = readPathEntity(sc, entityIdPath, inputPath, "organization"); - JavaPairRDD project = readPathEntity(sc, entityIdPath, inputPath, "project"); - JavaPairRDD dataset = readPathEntity(sc, entityIdPath, inputPath, "dataset"); - JavaPairRDD otherresearchproduct = readPathEntity(sc, entityIdPath, inputPath, "otherresearchproduct"); - JavaPairRDD software = readPathEntity(sc, entityIdPath, inputPath, "software"); - JavaPairRDD publication = readPathEntity(sc, entityIdPath, inputPath, "publication"); + JavaPairRDD datasource = readPathEntity(sc, inputPath, "datasource"); + JavaPairRDD organization = readPathEntity(sc, inputPath, "organization"); + JavaPairRDD project = readPathEntity(sc, inputPath, "project"); + JavaPairRDD dataset = readPathEntity(sc, inputPath, "dataset"); + JavaPairRDD otherresearchproduct = readPathEntity(sc, inputPath, "otherresearchproduct"); + JavaPairRDD software = readPathEntity(sc, inputPath, "software"); + JavaPairRDD publication = readPathEntity(sc, inputPath, "publication"); final String entitiesPath = outPath + "/entities"; datasource @@ -48,28 +48,31 @@ public class GraphJoiner implements Serializable { JavaPairRDD entities = sc.textFile(entitiesPath) .map(t -> new ObjectMapper().readValue(t, EntityRelEntity.class)) - .mapToPair(t -> new Tuple2<>(t.getSource().getSource(), t)); + .mapToPair(t -> new Tuple2<>(t.getSource().getSourceId(), t)); final JavaPairRDD relation = readPathRelation(sc, inputPath) + .filter(r -> !r.getDeleted()) .map(p -> new EntityRelEntity().setRelation(p)) - 
.mapToPair(p -> new Tuple2<>(p.getRelation().getSource(), p)) + .mapToPair(p -> new Tuple2<>(p.getRelation().getSourceId(), p)) .groupByKey() .map(p -> Iterables.limit(p._2(), MAX_RELS)) .flatMap(p -> p.iterator()) - .mapToPair(p -> new Tuple2<>(p.getRelation().getTarget(), p)); + .mapToPair(p -> new Tuple2<>(p.getRelation().getTargetId(), p)); final String joinByTargetPath = outPath + "/join_by_target"; - relation.join(entities) + relation + .join(entities + .filter(e -> !e._2().getSource().getDeleted()) + /*.mapToPair(e -> new Tuple2<>(e._1(), new MappingUtils().pruneModel(e._2())))*/) .map(s -> new EntityRelEntity() .setRelation(s._2()._1().getRelation()) .setTarget(s._2()._2().getSource())) .map(e -> new ObjectMapper().writeValueAsString(e)) .saveAsTextFile(joinByTargetPath, GzipCodec.class); - JavaPairRDD bySource = sc.textFile(joinByTargetPath) .map(t -> new ObjectMapper().readValue(t, EntityRelEntity.class)) - .mapToPair(t -> new Tuple2<>(t.getRelation().getSource(), t)); + .mapToPair(t -> new Tuple2<>(t.getRelation().getSourceId(), t)); entities .union(bySource) @@ -97,12 +100,17 @@ public class GraphJoiner implements Serializable { .saveAsTextFile(outPath + "/linked_entities", GzipCodec.class); } - private JavaPairRDD readPathEntity(final JavaSparkContext sc, final String idPath, final String inputPath, final String type) { + private JavaPairRDD readPathEntity(final JavaSparkContext sc, final String inputPath, final String type) { return sc.sequenceFile(inputPath + "/" + type, Text.class, Text.class) .mapToPair((PairFunction, String, TypedRow>) item -> { + final String json = item._2().toString(); - final String id = JsonPath.read(json, idPath); - return new Tuple2<>(id, new TypedRow(id, type, json)); + final String id = JsonPath.read(json, "$.id"); + return new Tuple2<>(id, new TypedRow() + .setSourceId(id) + .setDeleted(JsonPath.read(json, "$.dataInfo.deletedbyinference")) + .setType(type) + .setOaf(json)); }); } @@ -110,9 +118,12 @@ public class 
GraphJoiner implements Serializable { return sc.sequenceFile(inputPath + "/relation", Text.class, Text.class) .map(item -> { final String json = item._2().toString(); - final String source = JsonPath.read(json, "$.source"); - final String target = JsonPath.read(json, "$.target"); - return new TypedRow(source, target, "relation", json); + return new TypedRow() + .setSourceId(JsonPath.read(json, "$.source")) + .setTargetId(JsonPath.read(json, "$.target")) + .setDeleted(JsonPath.read(json, "$.dataInfo.deletedbyinference")) + .setType("relation") + .setOaf(json); }); } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/MappingUtils.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/MappingUtils.java new file mode 100644 index 000000000..756506c12 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/MappingUtils.java @@ -0,0 +1,103 @@ +package eu.dnetlib.dhp.graph; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.jayway.jsonpath.DocumentContext; +import com.jayway.jsonpath.JsonPath; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import net.minidev.json.JSONArray; + +import java.util.LinkedHashMap; +import java.util.stream.Collectors; + +public class MappingUtils { + + public EntityRelEntity pruneModel(EntityRelEntity e) throws JsonProcessingException { + + final DocumentContext j = JsonPath.parse(e.getSource().getOaf()); + final RelatedEntity re = new RelatedEntity(); + + switch (e.getSource().getType()) { + case "publication": + case "dataset": + case "otherresearchproduct": + case "software": + + mapTitle(j, re); + re.setDateofacceptance(j.read("$.dateofacceptance.value")); + re.setPublisher(j.read("$.publisher.value")); + + JSONArray pids = j.read("$.pid"); + re.setPid(pids.stream() + .map(p -> 
asStructuredProperty((LinkedHashMap) p)) + .collect(Collectors.toList())); + + re.setResulttype(asQualifier(j.read("$.resulttype"))); + + JSONArray collfrom = j.read("$.collectedfrom"); + re.setCollectedfrom(collfrom.stream() + .map(c -> asKV((LinkedHashMap)c)) + .collect(Collectors.toList())); + + //TODO still to be mapped + //re.setCodeRepositoryUrl(j.read("$.coderepositoryurl")); + + break; + case "datasource": + re.setOfficialname(j.read("$.officialname.value")); + re.setWebsiteurl(j.read("$.websiteurl.value")); + + re.setDatasourcetype(asQualifier(j.read("$.datasourcetype"))); + re.setOpenairecompatibility(asQualifier(j.read("$.openairecompatibility"))); + + break; + case "organization": + + break; + case "project": + mapTitle(j, re); + break; + } + + return new EntityRelEntity().setSource( + new TypedRow() + .setSourceId(e.getSource().getSourceId()) + .setDeleted(e.getSource().getDeleted()) + .setType(e.getSource().getType()) + .setOaf(new ObjectMapper().writeValueAsString(re))); + } + + private KeyValue asKV(LinkedHashMap j) { + final KeyValue kv = new KeyValue(); + kv.setKey((String) j.get("key")); + kv.setValue((String) j.get("value")); + return kv; + } + + private void mapTitle(DocumentContext j, RelatedEntity re) { + JSONArray a = j.read("$.title"); + if (!a.isEmpty()) { + re.setTitle(asStructuredProperty((LinkedHashMap) a.get(0))); + } + } + + private StructuredProperty asStructuredProperty(LinkedHashMap j) { + final StructuredProperty sp = new StructuredProperty(); + sp.setValue((String) j.get("value")); + sp.setQualifier(asQualifier((LinkedHashMap) j.get("qualifier"))); + return sp; + + } + + public Qualifier asQualifier(LinkedHashMap j) { + Qualifier q = new Qualifier(); + q.setClassid(j.get("classid")); + q.setClassname(j.get("classname")); + q.setSchemeid(j.get("schemeid")); + q.setSchemename(j.get("schemename")); + return q; + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/RelatedEntity.java 
b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/RelatedEntity.java new file mode 100644 index 000000000..a441392b2 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/RelatedEntity.java @@ -0,0 +1,210 @@ +package eu.dnetlib.dhp.graph; + +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; + +import java.io.Serializable; +import java.util.List; + +public class RelatedEntity implements Serializable { + + // results + private StructuredProperty title; // also for projects + private String dateofacceptance; + private String publisher; + private List pid; + private String codeRepositoryUrl; + private Qualifier resulttype; + private List collectedfrom; + + // datasource + private String officialname; + private String websiteurl; // also for organizations, projects + private Qualifier datasourcetype; + private Qualifier datasourcetypeui; + //private String aggregatortype; + private Qualifier openairecompatibility; + + // organization + private String legalname; + private String legalshortname; + private Qualifier country; + + // project + private String code; + private String acronym; + private Qualifier contracttype; + private String fundingtree; + + public StructuredProperty getTitle() { + return title; + } + + public RelatedEntity setTitle(StructuredProperty title) { + this.title = title; + return this; + } + + public String getDateofacceptance() { + return dateofacceptance; + } + + public RelatedEntity setDateofacceptance(String dateofacceptance) { + this.dateofacceptance = dateofacceptance; + return this; + } + + public String getPublisher() { + return publisher; + } + + public RelatedEntity setPublisher(String publisher) { + this.publisher = publisher; + return this; + } + + public List getPid() { + return pid; + } + + public RelatedEntity setPid(List pid) { + this.pid = pid; + return this; + } + + public String 
getCodeRepositoryUrl() { + return codeRepositoryUrl; + } + + public RelatedEntity setCodeRepositoryUrl(String codeRepositoryUrl) { + this.codeRepositoryUrl = codeRepositoryUrl; + return this; + } + + public Qualifier getResulttype() { + return resulttype; + } + + public RelatedEntity setResulttype(Qualifier resulttype) { + this.resulttype = resulttype; + return this; + } + + public List getCollectedfrom() { + return collectedfrom; + } + + public RelatedEntity setCollectedfrom(List collectedfrom) { + this.collectedfrom = collectedfrom; + return this; + } + + public String getOfficialname() { + return officialname; + } + + public RelatedEntity setOfficialname(String officialname) { + this.officialname = officialname; + return this; + } + + public String getWebsiteurl() { + return websiteurl; + } + + public RelatedEntity setWebsiteurl(String websiteurl) { + this.websiteurl = websiteurl; + return this; + } + + public Qualifier getDatasourcetype() { + return datasourcetype; + } + + public RelatedEntity setDatasourcetype(Qualifier datasourcetype) { + this.datasourcetype = datasourcetype; + return this; + } + + public Qualifier getDatasourcetypeui() { + return datasourcetypeui; + } + + public RelatedEntity setDatasourcetypeui(Qualifier datasourcetypeui) { + this.datasourcetypeui = datasourcetypeui; + return this; + } + + public Qualifier getOpenairecompatibility() { + return openairecompatibility; + } + + public RelatedEntity setOpenairecompatibility(Qualifier openairecompatibility) { + this.openairecompatibility = openairecompatibility; + return this; + } + + public String getLegalname() { + return legalname; + } + + public RelatedEntity setLegalname(String legalname) { + this.legalname = legalname; + return this; + } + + public String getLegalshortname() { + return legalshortname; + } + + public RelatedEntity setLegalshortname(String legalshortname) { + this.legalshortname = legalshortname; + return this; + } + + public Qualifier getCountry() { + return country; + } + + 
public RelatedEntity setCountry(Qualifier country) { + this.country = country; + return this; + } + + public String getCode() { + return code; + } + + public RelatedEntity setCode(String code) { + this.code = code; + return this; + } + + public String getAcronym() { + return acronym; + } + + public RelatedEntity setAcronym(String acronym) { + this.acronym = acronym; + return this; + } + + public Qualifier getContracttype() { + return contracttype; + } + + public RelatedEntity setContracttype(Qualifier contracttype) { + this.contracttype = contracttype; + return this; + } + + public String getFundingtree() { + return fundingtree; + } + + public RelatedEntity setFundingtree(String fundingtree) { + this.fundingtree = fundingtree; + return this; + } +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/Tuple.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/Tuple.java index 0b22a63a5..1eb0491a7 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/Tuple.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/Tuple.java @@ -1,8 +1,5 @@ package eu.dnetlib.dhp.graph; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.Relation; - import java.io.Serializable; public class Tuple implements Serializable { @@ -11,6 +8,7 @@ public class Tuple implements Serializable { private TypedRow target; + public TypedRow getRelation() { return relation; } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/TypedRow.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/TypedRow.java index 60c3b64b2..1acbbce93 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/TypedRow.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/TypedRow.java @@ -4,40 +4,40 @@ import java.io.Serializable; public class TypedRow implements Serializable { - private String source; - 
private String target; + private String sourceId; + + private String targetId; + + private Boolean deleted; + private String type; + private String oaf; - public TypedRow() { + public String getSourceId() { + return sourceId; } - public TypedRow(String source, String type, String oaf) { - this.source = source; - this.type = type; - this.oaf = oaf; - } - - public TypedRow(String source, String target, String type, String oaf) { - this(source, type, oaf); - this.target = target; - } - - public String getSource() { - return source; - } - - public TypedRow setSource(String source) { - this.source = source; + public TypedRow setSourceId(String sourceId) { + this.sourceId = sourceId; return this; } - public String getTarget() { - return target; + public String getTargetId() { + return targetId; } - public TypedRow setTarget(String target) { - this.target = target; + public TypedRow setTargetId(String targetId) { + this.targetId = targetId; + return this; + } + + public Boolean getDeleted() { + return deleted; + } + + public TypedRow setDeleted(Boolean deleted) { + this.deleted = deleted; return this; } @@ -58,5 +58,4 @@ public class TypedRow implements Serializable { this.oaf = oaf; return this; } - } diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/MappingUtilsTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/MappingUtilsTest.java new file mode 100644 index 000000000..2edb0aa70 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/MappingUtilsTest.java @@ -0,0 +1,42 @@ +package eu.dnetlib.dhp.graph; + +import org.codehaus.jackson.map.ObjectMapper; +import org.junit.Before; +import org.junit.Test; + +import java.io.IOException; +import java.io.InputStreamReader; + +public class MappingUtilsTest { + + private MappingUtils utils; + + @Before + public void setUp() { + utils = new MappingUtils(); + } + + @Test + public void testOafMappingDatasource() throws IOException { + + 
final InputStreamReader in = new InputStreamReader(getClass().getResourceAsStream("datasource.json")); + final EntityRelEntity e = new ObjectMapper().readValue(in, EntityRelEntity.class); + e.getSource().setType("datasource"); + + final EntityRelEntity out = utils.pruneModel(e); + System.out.println(out); + + } + + @Test + public void testOafMappinResult() throws IOException { + + final InputStreamReader in = new InputStreamReader(getClass().getResourceAsStream("result.json")); + final EntityRelEntity e = new ObjectMapper().readValue(in, EntityRelEntity.class); + e.getSource().setType("otherresearchproduct"); + + final EntityRelEntity out = utils.pruneModel(e); + System.out.println(out); + + } +} diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/datasource.json b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/datasource.json new file mode 100644 index 000000000..c26154c1e --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/datasource.json @@ -0,0 +1 @@ +{"source":{"sourceId":"10|CRIS_UNS____::f66f1bd369679b5b077dcdf006089556","targetId":null,"deleted":false,"oaf":"{\"datasourcetype\":{\"classid\":\"crissystem\",\"classname\":\"CRIS System\",\"schemeid\":\"dnet:datasource_typologies\",\"schemename\":\"dnet:datasource_typologies\"},\"openairecompatibility\":{\"classid\":\"openaire-cris_1.1\",\"classname\":\"OpenAIRE CRIS v1.1\",\"schemeid\":\"dnet:datasourceCompatibilityLevel\",\"schemename\":\"dnet:datasourceCompatibilityLevel\"},\"officialname\":{\"value\":\"CRIS UNS (Current Research Information System University of Novi Sad)\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"englishname\":{\"value\":\"CRIS UNS (Current Research Information System University of Novi 
Sad)\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"websiteurl\":{\"value\":\"https://cris.uns.ac.rs/\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"logourl\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"contactemail\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"namespaceprefix\":{\"value\":\"CrisUnsNoviS\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"latitude\":{\"value\":\"0.0\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"longitude\":{\"value\":\"0.0\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"dateofvalidation\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"description\":{\"value\":\"\",\"dataInfo\":{\"
invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"subjects\":[],\"odnumberofitems\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"odnumberofitemsdate\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"odpolicies\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"odlanguages\":[],\"odcontenttypes\":[],\"accessinfopackage\":[{\"value\":\"https://cris.uns.ac.rs/OAIHandlerOpenAIRECRIS\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}}],\"releasestartdate\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"releaseenddate\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"missionstatementurl\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\
",\"schemename\":\"\"}}},\"dataprovider\":{\"value\":false,\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"serviceprovider\":{\"value\":false,\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"databaseaccesstype\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"datauploadtype\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"databaseaccessrestriction\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"datauploadrestriction\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"versioning\":{\"value\":false,\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"citationguidelineurl\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename
\":\"\"}}},\"qualitymanagementkind\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"pidsystems\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"certificates\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"policies\":[],\"journal\":{\"name\":\"\",\"issnPrinted\":\"\",\"issnOnline\":\"\",\"issnLinking\":\"\",\"ep\":\"\",\"iss\":\"\",\"sp\":\"\",\"vol\":\"\",\"edition\":\"\",\"conferenceplace\":\"\",\"conferencedate\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"id\":\"10|CRIS_UNS____::f66f1bd369679b5b077dcdf006089556\",\"originalId\":[\"CRIS_UNS____::openaire\"],\"collectedfrom\":[{\"key\":\"\",\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}}],\"pid\":[],\"extraInfo\":[],\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:entityregistry\",\"classname\":\"sysimport:crosswalk:entityregistry\",\"schemeid\":\"dnet:provenance_actions\",\"schemename\":\"dnet:provenance_actions\"}},\"lastupdatetimestamp\":0}"},"relation":null,"target":null} \ No newline at end of file diff --git 
a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/result.json b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/result.json new file mode 100644 index 000000000..5d6c3f29b --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/result.json @@ -0,0 +1 @@ +{"source":{"sourceId":"50|od_______165::5642f79c597bac8029fde12a80f75412","targetId":null,"deleted":true,"type":"otherresearchproduct","oaf":"{\"contactperson\":[],\"contactgroup\":[],\"tool\":[],\"author\":[{\"fullname\":\"Cartier, Adrien\",\"name\":\"Adrien\",\"surname\":\"Cartier\",\"rank\":1,\"pid\":[],\"affiliation\":[]},{\"fullname\":\"Larroudé, Philippe\",\"name\":\"Philippe\",\"surname\":\"Larroudé\",\"rank\":2,\"pid\":[],\"affiliation\":[]},{\"fullname\":\"Héquette, Arnaud\",\"name\":\"Arnaud\",\"surname\":\"Héquette\",\"rank\":3,\"pid\":[],\"affiliation\":[]}],\"resulttype\":{\"classid\":\"other\",\"classname\":\"other\",\"schemeid\":\"dnet:result_typologies\",\"schemename\":\"dnet:result_typologies\"},\"language\":{\"classid\":\"eng\",\"classname\":\"English\",\"schemeid\":\"dnet:languages\",\"schemename\":\"dnet:languages\"},\"country\":[],\"subject\":[{\"value\":\"[SDU.STU.OC] Sciences of the Universe/Earth Sciences/Oceanography\",\"qualifier\":{\"classid\":\"keyword\",\"classname\":\"keyword\",\"schemeid\":\"dnet:subject_classification_typologies\",\"schemename\":\"dnet:subject_classification_typologies\"},\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}}],\"title\":[{\"value\":\"Longshore Sediment Transport Measurements on Sandy Macrotidal Beaches Compared with Sediment Transport Formulae\",\"qualifier\":{\"classid\":\"main title\",\"classname\":\"main 
title\",\"schemeid\":\"dnet:dataCite_title\",\"schemename\":\"dnet:dataCite_title\"},\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}}],\"relevantdate\":[],\"description\":[{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}}],\"dateofacceptance\":{\"value\":\"2013-03-13\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"publisher\":{\"value\":\"intech\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"embargoenddate\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"source\":[{\"value\":\"Sediment Transport Processes and Their Modelling Applications\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},{\"value\":\"https://hal.archives-ouvertes.fr/hal-00824453\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},{\"value\":\"Sediment Transport Processes and Their Modelling Applications, intech, chapitre 2, 2013, 978-953-51-1039-2,. 
\\u0026lt;10.5772/51023\\u0026gt;\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}}],\"fulltext\":[],\"format\":[],\"contributor\":[{\"value\":\"Equipe Morphodynamique des littoraux (Dunkerque) ; Laboratoire d\\u0027Océanologie et de Géosciences (LOG) ; Université du Littoral Côte d\\u0027Opale - Université Lille I - Sciences et technologies - CNRS - Université du Littoral Côte d\\u0027Opale - Université Lille I - Sciences et technologies - CNRS\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},{\"value\":\"Laboratoire des écoulements géophysiques et industriels (LEGI) ; Université Joseph Fourier - Grenoble I - Institut polytechnique de Grenoble (Grenoble INP) - CNRS\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},{\"value\":\"Laboratoire d\\u0027Océanologie et de Géosciences (LOG) ; Université du Littoral Côte d\\u0027Opale - Université Lille I - Sciences et technologies - 
CNRS\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}}],\"resourcetype\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"},\"coverage\":[],\"refereed\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"context\":[],\"id\":\"50|od_______165::5642f79c597bac8029fde12a80f75412\",\"originalId\":[\"oai:HAL:hal-00824453v1\"],\"collectedfrom\":[{\"key\":\"10|opendoar____::9766527f2b5d3e95d4a733fcfb77bd7e\",\"value\":\"INRIA a CCSD electronic archive server\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}}],\"pid\":[{\"value\":\"10.5772/51023\",\"qualifier\":{\"classid\":\"doi\",\"classname\":\"doi\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}}],\"extraInfo\":[],\"dataInfo\":{\"invisible\":false,\"inferred\":true,\"deletedbyinference\":true,\"inferenceprovenance\":\"dedup-similarity-result-levenstein\",\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"sysimport:crosswalk:repository\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"}},\"lastupdatetimestamp\":0}"},"relation":null,"target":null} \ No newline at end of file From fcbc4ccd70b7edfdb8c041dc9dbdaed5943fa13a Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 24 Jan 2020 08:43:23 +0100 Subject: 
[PATCH 14/45] a bit of docs doesn't hurt --- .../eu/dnetlib/dhp/graph/GraphJoiner.java | 40 ++++++++++++++++++- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java index d8641f272..96d1f150a 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java @@ -18,6 +18,24 @@ import scala.Tuple2; import java.io.Serializable; import java.util.List; +/** + * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. + * The operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, + * and all the possible relationships (similarity links produced by the Dedup process are excluded). + * + * The operation is implemented creating the union between the entity types (E), joined by the relationships (R), and again + * by E, finally grouped by E.id; + * + * Different manipulations of the E and R sets are introduced to reduce the complexity of the operation + * 1) treat the object payload as string, extracting only the necessary information beforehand using json path, + * it seems that deserializing it with jackson's object mapper has higher memory footprint. + * + * 2) only consider rels that are not virtually deleted ($.dataInfo.deletedbyinference == false) + * 3) we only need a subset of fields from the related entities, so we introduce a distinction between E_source = S + * and E_target = T. 
Objects in T are heavily pruned by all the unnecessary information + * + * 4) perform the join as (((T join R) union S) groupby S.id) yield S -> [ ] + */ public class GraphJoiner implements Serializable { public static final int MAX_RELS = 10; @@ -26,6 +44,7 @@ public class GraphJoiner implements Serializable { final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + // read each entity JavaPairRDD datasource = readPathEntity(sc, inputPath, "datasource"); JavaPairRDD organization = readPathEntity(sc, inputPath, "organization"); JavaPairRDD project = readPathEntity(sc, inputPath, "project"); @@ -34,6 +53,7 @@ public class GraphJoiner implements Serializable { JavaPairRDD software = readPathEntity(sc, inputPath, "software"); JavaPairRDD publication = readPathEntity(sc, inputPath, "publication"); + // create the union between all the entities final String entitiesPath = outPath + "/entities"; datasource .union(organization) @@ -50,8 +70,9 @@ public class GraphJoiner implements Serializable { .map(t -> new ObjectMapper().readValue(t, EntityRelEntity.class)) .mapToPair(t -> new Tuple2<>(t.getSource().getSourceId(), t)); + // reads the relationships final JavaPairRDD relation = readPathRelation(sc, inputPath) - .filter(r -> !r.getDeleted()) + .filter(r -> !r.getDeleted()) //only consider those that are not virtually deleted .map(p -> new EntityRelEntity().setRelation(p)) .mapToPair(p -> new Tuple2<>(p.getRelation().getSourceId(), p)) .groupByKey() @@ -98,8 +119,16 @@ public class GraphJoiner implements Serializable { }) .map(e -> new ObjectMapper().writeValueAsString(e)) .saveAsTextFile(outPath + "/linked_entities", GzipCodec.class); - } + } + /** + * Reads a set of eu.dnetlib.dhp.schema.oaf.OafEntity objects from a sequence file , + * extracts necessary information using json path, wraps the oaf object in a eu.dnetlib.dhp.graph.TypedRow + * @param sc + * @param inputPath + * @param type + * @return the JavaPairRDD indexed by entity identifier + */ 
private JavaPairRDD readPathEntity(final JavaSparkContext sc, final String inputPath, final String type) { return sc.sequenceFile(inputPath + "/" + type, Text.class, Text.class) .mapToPair((PairFunction, String, TypedRow>) item -> { @@ -114,6 +143,13 @@ public class GraphJoiner implements Serializable { }); } + /** + * Reads a set of eu.dnetlib.dhp.schema.oaf.Relation objects from a sequence file , + * extracts necessary information using json path, wraps the oaf object in a eu.dnetlib.dhp.graph.TypedRow + * @param sc + * @param inputPath + * @return the JavaRDD containing all the relationships + */ private JavaRDD readPathRelation(final JavaSparkContext sc, final String inputPath) { return sc.sequenceFile(inputPath + "/relation", Text.class, Text.class) .map(item -> { From 0dff14b28e3e08153bd321bf993f30f08afae972 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Mon, 27 Jan 2020 10:53:54 +0100 Subject: [PATCH 15/45] added property to gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 3f00d9729..4feeb36c3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,8 @@ .DS_Store .idea *.iml +*.ipr +*.iws *~ .classpath /*/.classpath From 8c2aff99b01b555581b273640d4c7198ee6eaac8 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 29 Jan 2020 15:40:33 +0100 Subject: [PATCH 16/45] joining entities using T x R x S, WIP: last representation based on LinkedEntity type --- .../eu/dnetlib/dhp/graph/GraphJoiner.java | 71 +++++++++-------- .../eu/dnetlib/dhp/graph/GraphMapper.java | 77 +++++++++++++++++++ .../main/java/eu/dnetlib/dhp/graph/Link.java | 30 ++++++++ .../eu/dnetlib/dhp/graph/LinkedEntity.java | 25 ++++-- .../dhp/graph/LinkedEntityWrapper.java | 40 ++++++++++ .../eu/dnetlib/dhp/graph/MappingUtils.java | 49 ++++++++---- .../eu/dnetlib/dhp/graph/RelatedEntity.java | 55 +++++++++++-- .../dhp/graph/SparkGraphIndexingJob.java | 1 + .../graph/{Tuple.java => TupleWrapper.java} | 6 +- 
.../dnetlib/dhp/graph/MappingUtilsTest.java | 10 +++ .../eu/dnetlib/dhp/graph/related_entity.json | 5 ++ 11 files changed, 309 insertions(+), 60 deletions(-) create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphMapper.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/Link.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/LinkedEntityWrapper.java rename dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/{Tuple.java => TupleWrapper.java} (70%) create mode 100644 dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/related_entity.json diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java index 96d1f150a..110649522 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java @@ -1,11 +1,14 @@ package eu.dnetlib.dhp.graph; +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.jayway.jsonpath.DocumentContext; import com.jayway.jsonpath.JsonPath; -import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.*; +import net.minidev.json.JSONArray; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.api.java.JavaPairRDD; @@ -15,8 +18,10 @@ import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.sql.SparkSession; import scala.Tuple2; +import java.io.IOException; import java.io.Serializable; import java.util.List; +import java.util.stream.Collectors; /** * 
Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. @@ -38,7 +43,7 @@ import java.util.List; */ public class GraphJoiner implements Serializable { - public static final int MAX_RELS = 10; + public static final int MAX_RELS = 100; public void join(final SparkSession spark, final String inputPath, final String hiveDbName, final String outPath) { @@ -63,7 +68,7 @@ public class GraphJoiner implements Serializable { .union(software) .union(publication) .map(e -> new EntityRelEntity().setSource(e._2())) - .map(e -> new ObjectMapper().writeValueAsString(e)) + .map(MappingUtils::serialize) .saveAsTextFile(entitiesPath, GzipCodec.class); JavaPairRDD entities = sc.textFile(entitiesPath) @@ -84,41 +89,24 @@ public class GraphJoiner implements Serializable { relation .join(entities .filter(e -> !e._2().getSource().getDeleted()) - /*.mapToPair(e -> new Tuple2<>(e._1(), new MappingUtils().pruneModel(e._2())))*/) + .mapToPair(e -> new Tuple2<>(e._1(), MappingUtils.pruneModel(e._2())))) .map(s -> new EntityRelEntity() .setRelation(s._2()._1().getRelation()) .setTarget(s._2()._2().getSource())) - .map(e -> new ObjectMapper().writeValueAsString(e)) + .map(MappingUtils::serialize) .saveAsTextFile(joinByTargetPath, GzipCodec.class); JavaPairRDD bySource = sc.textFile(joinByTargetPath) .map(t -> new ObjectMapper().readValue(t, EntityRelEntity.class)) .mapToPair(t -> new Tuple2<>(t.getRelation().getSourceId(), t)); + final String linkedEntitiesPath = outPath + "/linked_entities"; entities .union(bySource) .groupByKey() // by source id - .map(p -> { - final LinkedEntity e = new LinkedEntity(); - final List links = Lists.newArrayList(); - for(EntityRelEntity rel : p._2()) { - if (rel.hasMainEntity() & e.getEntity() == null) { - e.setEntity(rel.getSource()); - } - if (rel.hasRelatedEntity()) { - links.add(new Tuple() - .setRelation(rel.getRelation()) - .setTarget(rel.getTarget())); - } - } - e.setLinks(links); - if (e.getEntity() == 
null) { - throw new IllegalStateException("missing main entity on '" + p._1() + "'"); - } - return e; - }) - .map(e -> new ObjectMapper().writeValueAsString(e)) - .saveAsTextFile(outPath + "/linked_entities", GzipCodec.class); + .map(GraphJoiner::asLinkedEntityWrapper) + .map(MappingUtils::serialize) + .saveAsTextFile(linkedEntitiesPath, GzipCodec.class); } /** @@ -153,14 +141,35 @@ public class GraphJoiner implements Serializable { private JavaRDD readPathRelation(final JavaSparkContext sc, final String inputPath) { return sc.sequenceFile(inputPath + "/relation", Text.class, Text.class) .map(item -> { - final String json = item._2().toString(); + final String s = item._2().toString(); + final DocumentContext json = JsonPath.parse(s); return new TypedRow() - .setSourceId(JsonPath.read(json, "$.source")) - .setTargetId(JsonPath.read(json, "$.target")) - .setDeleted(JsonPath.read(json, "$.dataInfo.deletedbyinference")) + .setSourceId(json.read("$.source")) + .setTargetId(json.read("$.target")) + .setDeleted(json.read("$.dataInfo.deletedbyinference")) .setType("relation") - .setOaf(json); + .setOaf(s); }); } + private static LinkedEntityWrapper asLinkedEntityWrapper(Tuple2> p) { + final LinkedEntityWrapper e = new LinkedEntityWrapper(); + final List links = Lists.newArrayList(); + for (EntityRelEntity rel : p._2()) { + if (rel.hasMainEntity() & e.getEntity() == null) { + e.setEntity(rel.getSource()); + } + if (rel.hasRelatedEntity()) { + links.add(new TupleWrapper() + .setRelation(rel.getRelation()) + .setTarget(rel.getTarget())); + } + } + e.setLinks(links); + if (e.getEntity() == null) { + throw new IllegalStateException("missing main entity on '" + p._1() + "'"); + } + return e; + } + } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphMapper.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphMapper.java new file mode 100644 index 000000000..bdfea7979 --- /dev/null +++ 
b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphMapper.java @@ -0,0 +1,77 @@ +package eu.dnetlib.dhp.graph; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.schema.oaf.*; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; + +import java.io.IOException; +import java.util.stream.Collectors; + +public class GraphMapper { + + + public void map(final SparkSession spark, final String outPath) { + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + sc.textFile(outPath + "/linked_entities") + .map(LinkedEntityWrapper::parse) + .map(GraphMapper::asLinkedEntity) + .map(e -> new ObjectMapper().writeValueAsString(e)) + .saveAsTextFile(outPath + "/linked_entities_types"); + } + + private static LinkedEntity asLinkedEntity(final LinkedEntityWrapper lw) throws JsonProcessingException { + final LinkedEntity le = new LinkedEntity(); + + try { + le.setType(lw.getEntity().getType()); + le.setEntity(parseEntity(lw.getEntity().getOaf(), le.getType())); + le.setLinks(lw.getLinks() + .stream() + .map(l -> new Link() + .setRelation(parseRelation(l.getRelation().getOaf())) + .setRelatedEntity(RelatedEntity.parse(l.getTarget().getOaf()))) + .collect(Collectors.toList())); + return le; + } catch (IllegalArgumentException e) { + throw new IllegalArgumentException(new ObjectMapper().writeValueAsString(lw), e); + } + } + + private static Relation parseRelation(final String s) { + try { + return new ObjectMapper().readValue(s, Relation.class); + } catch (IOException e) { + throw new IllegalArgumentException("unable to decode Relation: " + s); + } + } + + private static OafEntity parseEntity(final String json, final String type) { + final ObjectMapper o = new ObjectMapper(); + try { + switch (type) { + case "publication": + return o.readValue(json, Publication.class); + case "dataset": + return o.readValue(json, 
Dataset.class); + case "otherresearchproduct": + return o.readValue(json, OtherResearchProduct.class); + case "software": + return o.readValue(json, Software.class); + case "datasource": + return o.readValue(json, Datasource.class); + case "project": + return o.readValue(json, Project.class); + case "organization": + return o.readValue(json, Organization.class); + default: + throw new IllegalArgumentException("invalid entity type: " + type); + } + } catch (IOException e) { + throw new IllegalArgumentException("unable to decode oaf entity: " + json); + } + } +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/Link.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/Link.java new file mode 100644 index 000000000..8426fbd12 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/Link.java @@ -0,0 +1,30 @@ +package eu.dnetlib.dhp.graph; + +import eu.dnetlib.dhp.schema.oaf.Relation; + +import java.io.Serializable; + +public class Link implements Serializable { + + private Relation relation; + + private RelatedEntity relatedEntity; + + public Relation getRelation() { + return relation; + } + + public Link setRelation(Relation relation) { + this.relation = relation; + return this; + } + + public RelatedEntity getRelatedEntity() { + return relatedEntity; + } + + public Link setRelatedEntity(RelatedEntity relatedEntity) { + this.relatedEntity = relatedEntity; + return this; + } +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/LinkedEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/LinkedEntity.java index 9e6fc0d38..c7c2d1892 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/LinkedEntity.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/LinkedEntity.java @@ -1,28 +1,41 @@ package eu.dnetlib.dhp.graph; +import eu.dnetlib.dhp.schema.oaf.OafEntity; + import 
java.io.Serializable; import java.util.List; public class LinkedEntity implements Serializable { - private TypedRow entity; + private String type; - private List links; + private OafEntity entity; - public TypedRow getEntity() { + private List links; + + public String getType() { + return type; + } + + public LinkedEntity setType(String type) { + this.type = type; + return this; + } + + public OafEntity getEntity() { return entity; } - public LinkedEntity setEntity(TypedRow entity) { + public LinkedEntity setEntity(OafEntity entity) { this.entity = entity; return this; } - public List getLinks() { + public List getLinks() { return links; } - public LinkedEntity setLinks(List links) { + public LinkedEntity setLinks(List links) { this.links = links; return this; } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/LinkedEntityWrapper.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/LinkedEntityWrapper.java new file mode 100644 index 000000000..17853208c --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/LinkedEntityWrapper.java @@ -0,0 +1,40 @@ +package eu.dnetlib.dhp.graph; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import java.io.IOException; +import java.io.Serializable; +import java.util.List; + +public class LinkedEntityWrapper implements Serializable { + + private TypedRow entity; + + private List links; + + public static LinkedEntityWrapper parse(final String s) { + try { + return new ObjectMapper().readValue(s, LinkedEntityWrapper.class); + } catch (IOException e) { + throw new IllegalArgumentException("unable to decode LinkedEntityWrapper: " + s); + } + } + + public TypedRow getEntity() { + return entity; + } + + public LinkedEntityWrapper setEntity(TypedRow entity) { + this.entity = entity; + return this; + } + + public List getLinks() { + return links; + } + + public LinkedEntityWrapper setLinks(List links) { + this.links = links; + return this; + 
} +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/MappingUtils.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/MappingUtils.java index 756506c12..9f7ca4d0b 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/MappingUtils.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/MappingUtils.java @@ -1,5 +1,6 @@ package eu.dnetlib.dhp.graph; +import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.jayway.jsonpath.DocumentContext; @@ -14,17 +15,16 @@ import java.util.stream.Collectors; public class MappingUtils { - public EntityRelEntity pruneModel(EntityRelEntity e) throws JsonProcessingException { + public static EntityRelEntity pruneModel(EntityRelEntity e) { final DocumentContext j = JsonPath.parse(e.getSource().getOaf()); - final RelatedEntity re = new RelatedEntity(); + final RelatedEntity re = new RelatedEntity().setId(j.read("$.id")).setType(e.getSource().getType()); switch (e.getSource().getType()) { case "publication": case "dataset": case "otherresearchproduct": case "software": - mapTitle(j, re); re.setDateofacceptance(j.read("$.dateofacceptance.value")); re.setPublisher(j.read("$.publisher.value")); @@ -48,51 +48,62 @@ public class MappingUtils { case "datasource": re.setOfficialname(j.read("$.officialname.value")); re.setWebsiteurl(j.read("$.websiteurl.value")); - re.setDatasourcetype(asQualifier(j.read("$.datasourcetype"))); re.setOpenairecompatibility(asQualifier(j.read("$.openairecompatibility"))); break; case "organization": + re.setLegalname(j.read("$.legalname.value")); + re.setLegalshortname(j.read("$.legalshortname.value")); + re.setCountry(asQualifier(j.read("$.country"))); break; case "project": - mapTitle(j, re); + re.setProjectTitle(j.read("$.title.value")); + re.setCode(j.read("$.code.value")); + 
re.setAcronym(j.read("$.acronym.value")); + re.setContracttype(asQualifier(j.read("$.contracttype"))); + + JSONArray f = j.read("$.fundingtree"); + if (!f.isEmpty()) { + re.setFundingtree(f.stream() + .map(s -> s.toString()) + .collect(Collectors.toList())); + } + break; } - return new EntityRelEntity().setSource( new TypedRow() .setSourceId(e.getSource().getSourceId()) .setDeleted(e.getSource().getDeleted()) .setType(e.getSource().getType()) - .setOaf(new ObjectMapper().writeValueAsString(re))); + .setOaf(serialize(re))); } - private KeyValue asKV(LinkedHashMap j) { + private static KeyValue asKV(LinkedHashMap j) { final KeyValue kv = new KeyValue(); kv.setKey((String) j.get("key")); kv.setValue((String) j.get("value")); return kv; } - private void mapTitle(DocumentContext j, RelatedEntity re) { - JSONArray a = j.read("$.title"); + private static void mapTitle(DocumentContext j, RelatedEntity re) { + final JSONArray a = j.read("$.title"); if (!a.isEmpty()) { re.setTitle(asStructuredProperty((LinkedHashMap) a.get(0))); } } - private StructuredProperty asStructuredProperty(LinkedHashMap j) { + private static StructuredProperty asStructuredProperty(LinkedHashMap j) { final StructuredProperty sp = new StructuredProperty(); sp.setValue((String) j.get("value")); sp.setQualifier(asQualifier((LinkedHashMap) j.get("qualifier"))); return sp; - } - public Qualifier asQualifier(LinkedHashMap j) { - Qualifier q = new Qualifier(); + public static Qualifier asQualifier(LinkedHashMap j) { + final Qualifier q = new Qualifier(); q.setClassid(j.get("classid")); q.setClassname(j.get("classname")); q.setSchemeid(j.get("schemeid")); @@ -100,4 +111,14 @@ public class MappingUtils { return q; } + public static String serialize(final Object o) { + try { + return new ObjectMapper() + .setSerializationInclusion(JsonInclude.Include.NON_NULL) + .writeValueAsString(o); + } catch (JsonProcessingException e) { + throw new IllegalArgumentException("unable to serialize: " + o.toString(), e); + } + 
} + } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/RelatedEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/RelatedEntity.java index a441392b2..afd6e310b 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/RelatedEntity.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/RelatedEntity.java @@ -3,14 +3,22 @@ package eu.dnetlib.dhp.graph; import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import org.codehaus.jackson.map.ObjectMapper; +import java.io.IOException; import java.io.Serializable; import java.util.List; public class RelatedEntity implements Serializable { + private String id; + private String type; + + // common fields + private StructuredProperty title; + private String websiteurl; // datasource, organizations, projects + // results - private StructuredProperty title; // also for projects private String dateofacceptance; private String publisher; private List pid; @@ -20,11 +28,10 @@ public class RelatedEntity implements Serializable { // datasource private String officialname; - private String websiteurl; // also for organizations, projects private Qualifier datasourcetype; private Qualifier datasourcetypeui; - //private String aggregatortype; private Qualifier openairecompatibility; + //private String aggregatortype; // organization private String legalname; @@ -32,10 +39,28 @@ public class RelatedEntity implements Serializable { private Qualifier country; // project + private String projectTitle; private String code; private String acronym; private Qualifier contracttype; - private String fundingtree; + private List fundingtree; + + public static RelatedEntity parse(final String json) { + try { + return new ObjectMapper().readValue(json, RelatedEntity.class); + } catch (IOException e) { + throw new IllegalArgumentException("invalid 
RelatedEntity, cannot parse: " + json); + } + } + + public String getId() { + return id; + } + + public RelatedEntity setId(String id) { + this.id = id; + return this; + } public StructuredProperty getTitle() { return title; @@ -199,12 +224,30 @@ public class RelatedEntity implements Serializable { return this; } - public String getFundingtree() { + public List getFundingtree() { return fundingtree; } - public RelatedEntity setFundingtree(String fundingtree) { + public RelatedEntity setFundingtree(List fundingtree) { this.fundingtree = fundingtree; return this; } + + public String getProjectTitle() { + return projectTitle; + } + + public RelatedEntity setProjectTitle(String projectTitle) { + this.projectTitle = projectTitle; + return this; + } + + public String getType() { + return type; + } + + public RelatedEntity setType(String type) { + this.type = type; + return this; + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkGraphIndexingJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkGraphIndexingJob.java index 1d55dda89..3915bef08 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkGraphIndexingJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkGraphIndexingJob.java @@ -38,6 +38,7 @@ public class SparkGraphIndexingJob { } new GraphJoiner().join(spark, inputPath, hiveDbName, OUTPUT_BASE_PATH); + new GraphMapper().map(spark, OUTPUT_BASE_PATH); } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/Tuple.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/TupleWrapper.java similarity index 70% rename from dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/Tuple.java rename to dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/TupleWrapper.java index 1eb0491a7..eb60e1474 100644 --- 
a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/Tuple.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/TupleWrapper.java @@ -2,7 +2,7 @@ package eu.dnetlib.dhp.graph; import java.io.Serializable; -public class Tuple implements Serializable { +public class TupleWrapper implements Serializable { private TypedRow relation; @@ -13,7 +13,7 @@ public class Tuple implements Serializable { return relation; } - public Tuple setRelation(TypedRow relation) { + public TupleWrapper setRelation(TypedRow relation) { this.relation = relation; return this; } @@ -22,7 +22,7 @@ public class Tuple implements Serializable { return target; } - public Tuple setTarget(TypedRow target) { + public TupleWrapper setTarget(TypedRow target) { this.target = target; return this; } diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/MappingUtilsTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/MappingUtilsTest.java index 2edb0aa70..199d12132 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/MappingUtilsTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/MappingUtilsTest.java @@ -39,4 +39,14 @@ public class MappingUtilsTest { System.out.println(out); } + + @Test + public void testParseRelatedEntity() throws IOException { + + final InputStreamReader in = new InputStreamReader(getClass().getResourceAsStream("related_entity.json")); + final RelatedEntity e = new ObjectMapper().readValue(in, RelatedEntity.class); + + System.out.println(e); + + } } diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/related_entity.json b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/related_entity.json new file mode 100644 index 000000000..25c92baa3 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/related_entity.json @@ -0,0 +1,5 @@ +{ + 
"id": "20|nih_________::6b8108b6d6399f7163a6a7ccdd0efc2d", + "type": "organization", + "legalname": "MCGILL UNIVERSITY" +} \ No newline at end of file From 1ecca69f499c229b82e25501b70201f760eca4e5 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 30 Jan 2020 17:45:28 +0100 Subject: [PATCH 17/45] added annotation to ignore method during the serialization --- .../eu/dnetlib/dhp/schema/oaf/GeoLocation.java | 3 ++- .../java/eu/dnetlib/dhp/schema/oaf/KeyValue.java | 2 ++ .../java/eu/dnetlib/dhp/schema/oaf/Qualifier.java | 15 +++++++++++---- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/GeoLocation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/GeoLocation.java index 43af60286..1839fbd53 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/GeoLocation.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/GeoLocation.java @@ -1,5 +1,6 @@ package eu.dnetlib.dhp.schema.oaf; +import com.fasterxml.jackson.annotation.JsonIgnore; import org.apache.commons.lang3.StringUtils; import java.io.Serializable; @@ -36,7 +37,7 @@ public class GeoLocation implements Serializable { this.place = place; } - + @JsonIgnore public boolean isBlank() { return StringUtils.isBlank(point) && StringUtils.isBlank(box) && diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java index 74d9f77bd..5a841b96f 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java @@ -1,5 +1,6 @@ package eu.dnetlib.dhp.schema.oaf; +import com.fasterxml.jackson.annotation.JsonIgnore; import org.apache.commons.lang3.StringUtils; import java.io.Serializable; @@ -40,6 +41,7 @@ public class KeyValue implements Serializable { return isBlank()?"":String.format("%s::%s", key != null ? key.toLowerCase() : "", value != null ? 
value.toLowerCase() : ""); } + @JsonIgnore public boolean isBlank() { return StringUtils.isBlank(key) && StringUtils.isBlank(value); } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java index 7e4660f4b..00ae88c52 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java @@ -1,5 +1,6 @@ package eu.dnetlib.dhp.schema.oaf; +import com.fasterxml.jackson.annotation.JsonIgnore; import org.apache.commons.lang3.StringUtils; import java.io.Serializable; @@ -15,32 +16,36 @@ public class Qualifier implements Serializable { return classid; } - public void setClassid(String classid) { + public Qualifier setClassid(String classid) { this.classid = classid; + return this; } public String getClassname() { return classname; } - public void setClassname(String classname) { + public Qualifier setClassname(String classname) { this.classname = classname; + return this; } public String getSchemeid() { return schemeid; } - public void setSchemeid(String schemeid) { + public Qualifier setSchemeid(String schemeid) { this.schemeid = schemeid; + return this; } public String getSchemename() { return schemename; } - public void setSchemename(String schemename) { + public Qualifier setSchemename(String schemename) { this.schemename = schemename; + return this; } public String toComparableString() { @@ -50,6 +55,8 @@ public class Qualifier implements Serializable { schemeid != null ? schemeid : "", schemename != null ? 
schemename : ""); } + + @JsonIgnore public boolean isBlank() { return StringUtils.isBlank(classid) && StringUtils.isBlank(classname) && From b2691a3b0a09c3d8bb04272e584491f190763f85 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 30 Jan 2020 17:46:29 +0100 Subject: [PATCH 18/45] save adjacency list as JoinedEntity --- .../job-override.properties | 3 +- .../eu/dnetlib/dhp/graph/EntityRelEntity.java | 1 + .../eu/dnetlib/dhp/graph/GraphJoiner.java | 224 ++++++++++++++---- .../dnetlib/dhp/graph/GraphMappingUtils.java | 137 +++++++++++ .../eu/dnetlib/dhp/graph/JoinedEntity.java | 44 ++++ .../eu/dnetlib/dhp/graph/MappingUtils.java | 103 -------- .../eu/dnetlib/dhp/graph/RelatedEntity.java | 57 ++++- ...Job.java => SparkXmlRecordBuilderJob.java} | 23 +- .../dhp/graph/input_graph_parameters.json | 2 +- .../dnetlib/dhp/graph/oozie_app/workflow.xml | 11 +- .../dnetlib/dhp/graph/MappingUtilsTest.java | 4 +- 11 files changed, 427 insertions(+), 182 deletions(-) create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/JoinedEntity.java delete mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/MappingUtils.java rename dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/{SparkGraphIndexingJob.java => SparkXmlRecordBuilderJob.java} (54%) diff --git a/dhp-workflows/dhp-graph-provision/job-override.properties b/dhp-workflows/dhp-graph-provision/job-override.properties index 1870b0e6e..acaf16717 100644 --- a/dhp-workflows/dhp-graph-provision/job-override.properties +++ b/dhp-workflows/dhp-graph-provision/job-override.properties @@ -1,4 +1,5 @@ sparkDriverMemory=7G sparkExecutorMemory=7G hive_db_name=claudio -sourcePath=/tmp/db_openaireplus_services_beta.export.2019.11.06 \ No newline at end of file +sourcePath=/tmp/db_openaireplus_services_beta.export.2019.11.06 +outputPath=/tmp/openaire_provision \ No newline at end of file diff --git 
a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/EntityRelEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/EntityRelEntity.java index b0711bbff..285cacbc0 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/EntityRelEntity.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/EntityRelEntity.java @@ -3,6 +3,7 @@ package eu.dnetlib.dhp.graph; import java.io.Serializable; public class EntityRelEntity implements Serializable { + private TypedRow source; private TypedRow relation; private TypedRow target; diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java index 96d1f150a..f7bf0da39 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java @@ -1,11 +1,12 @@ package eu.dnetlib.dhp.graph; +import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.jayway.jsonpath.DocumentContext; import com.jayway.jsonpath.JsonPath; -import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.*; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.api.java.JavaPairRDD; @@ -15,8 +16,10 @@ import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.sql.SparkSession; import scala.Tuple2; +import java.io.IOException; import java.io.Serializable; import java.util.List; +import java.util.stream.Collectors; /** * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. 
@@ -40,21 +43,32 @@ public class GraphJoiner implements Serializable { public static final int MAX_RELS = 10; - public void join(final SparkSession spark, final String inputPath, final String hiveDbName, final String outPath) { + private SparkSession spark; - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + private String inputPath; + + private String outPath; + + public GraphJoiner(SparkSession spark, String inputPath, String outPath) { + this.spark = spark; + this.inputPath = inputPath; + this.outPath = outPath; + } + + public GraphJoiner adjacencyLists() { + final JavaSparkContext sc = new JavaSparkContext(getSpark().sparkContext()); // read each entity - JavaPairRDD datasource = readPathEntity(sc, inputPath, "datasource"); - JavaPairRDD organization = readPathEntity(sc, inputPath, "organization"); - JavaPairRDD project = readPathEntity(sc, inputPath, "project"); - JavaPairRDD dataset = readPathEntity(sc, inputPath, "dataset"); - JavaPairRDD otherresearchproduct = readPathEntity(sc, inputPath, "otherresearchproduct"); - JavaPairRDD software = readPathEntity(sc, inputPath, "software"); - JavaPairRDD publication = readPathEntity(sc, inputPath, "publication"); + JavaPairRDD datasource = readPathEntity(sc, getInputPath(), "datasource"); + JavaPairRDD organization = readPathEntity(sc, getInputPath(), "organization"); + JavaPairRDD project = readPathEntity(sc, getInputPath(), "project"); + JavaPairRDD dataset = readPathEntity(sc, getInputPath(), "dataset"); + JavaPairRDD otherresearchproduct = readPathEntity(sc, getInputPath(), "otherresearchproduct"); + JavaPairRDD software = readPathEntity(sc, getInputPath(), "software"); + JavaPairRDD publication = readPathEntity(sc, getInputPath(), "publication"); // create the union between all the entities - final String entitiesPath = outPath + "/entities"; + final String entitiesPath = getOutPath() + "/0_entities"; datasource .union(organization) .union(project) @@ -63,7 +77,7 @@ public class 
GraphJoiner implements Serializable { .union(software) .union(publication) .map(e -> new EntityRelEntity().setSource(e._2())) - .map(e -> new ObjectMapper().writeValueAsString(e)) + .map(GraphMappingUtils::serialize) .saveAsTextFile(entitiesPath, GzipCodec.class); JavaPairRDD entities = sc.textFile(entitiesPath) @@ -71,7 +85,7 @@ public class GraphJoiner implements Serializable { .mapToPair(t -> new Tuple2<>(t.getSource().getSourceId(), t)); // reads the relationships - final JavaPairRDD relation = readPathRelation(sc, inputPath) + final JavaPairRDD relation = readPathRelation(sc, getInputPath()) .filter(r -> !r.getDeleted()) //only consider those that are not virtually deleted .map(p -> new EntityRelEntity().setRelation(p)) .mapToPair(p -> new Tuple2<>(p.getRelation().getSourceId(), p)) @@ -80,45 +94,156 @@ public class GraphJoiner implements Serializable { .flatMap(p -> p.iterator()) .mapToPair(p -> new Tuple2<>(p.getRelation().getTargetId(), p)); - final String joinByTargetPath = outPath + "/join_by_target"; + final String joinByTargetPath = getOutPath() + "/1_join_by_target"; relation .join(entities .filter(e -> !e._2().getSource().getDeleted()) - /*.mapToPair(e -> new Tuple2<>(e._1(), new MappingUtils().pruneModel(e._2())))*/) + .mapToPair(e -> new Tuple2<>(e._1(), new GraphMappingUtils().pruneModel(e._2())))) .map(s -> new EntityRelEntity() .setRelation(s._2()._1().getRelation()) .setTarget(s._2()._2().getSource())) - .map(e -> new ObjectMapper().writeValueAsString(e)) + .map(GraphMappingUtils::serialize) .saveAsTextFile(joinByTargetPath, GzipCodec.class); JavaPairRDD bySource = sc.textFile(joinByTargetPath) .map(t -> new ObjectMapper().readValue(t, EntityRelEntity.class)) .mapToPair(t -> new Tuple2<>(t.getRelation().getSourceId(), t)); + final String linkedEntityPath = getOutPath() + "/2_linked_entities"; entities .union(bySource) .groupByKey() // by source id - .map(p -> { - final LinkedEntity e = new LinkedEntity(); - final List links = 
Lists.newArrayList(); - for(EntityRelEntity rel : p._2()) { - if (rel.hasMainEntity() & e.getEntity() == null) { - e.setEntity(rel.getSource()); - } - if (rel.hasRelatedEntity()) { - links.add(new Tuple() - .setRelation(rel.getRelation()) - .setTarget(rel.getTarget())); - } - } - e.setLinks(links); - if (e.getEntity() == null) { - throw new IllegalStateException("missing main entity on '" + p._1() + "'"); - } - return e; - }) - .map(e -> new ObjectMapper().writeValueAsString(e)) - .saveAsTextFile(outPath + "/linked_entities", GzipCodec.class); + .map(p -> toLinkedEntity(p)) + .map(e -> new ObjectMapper().setSerializationInclusion(JsonInclude.Include.NON_NULL).writeValueAsString(e)) + .saveAsTextFile(linkedEntityPath, GzipCodec.class); + + final String joinedEntitiesPath = getOutPath() + "/3_joined_entities"; + sc.textFile(linkedEntityPath) + .map(s -> new ObjectMapper().readValue(s, LinkedEntity.class)) + .map(l -> toJoinedEntity(l)) + .map(j -> new ObjectMapper().setSerializationInclusion(JsonInclude.Include.NON_NULL).writeValueAsString(j)) + .saveAsTextFile(joinedEntitiesPath); + + return this; + } + + public GraphJoiner asXML() { + final JavaSparkContext sc = new JavaSparkContext(getSpark().sparkContext()); + + final String joinedEntitiesPath = getOutPath() + "/3_joined_entities"; + sc.textFile(joinedEntitiesPath) + .map(s -> new ObjectMapper().readValue(s, LinkedEntity.class)) + .map(l -> toXML(l)) + .saveAsTextFile(getOutPath() + "/4_xml"); + + return this; + } + + private String toXML(LinkedEntity l) { + + return null; + } + + public SparkSession getSpark() { + return spark; + } + + public GraphJoiner setSpark(SparkSession spark) { + this.spark = spark; + return this; + } + + public String getInputPath() { + return inputPath; + } + + public GraphJoiner setInputPath(String inputPath) { + this.inputPath = inputPath; + return this; + } + + public String getOutPath() { + return outPath; + } + + public GraphJoiner setOutPath(String outPath) { + this.outPath = 
outPath; + return this; + } + + // HELPERS + + private OafEntity parseOaf(final String json, final String type) { + final ObjectMapper o = new ObjectMapper(); + try { + switch (type) { + case "publication": + return o.readValue(json, Publication.class); + case "dataset": + return o.readValue(json, Dataset.class); + case "otherresearchproduct": + return o.readValue(json, OtherResearchProduct.class); + case "software": + return o.readValue(json, Software.class); + case "datasource": + return o.readValue(json, Datasource.class); + case "organization": + return o.readValue(json, Organization.class); + case "project": + return o.readValue(json, Project.class); + default: + throw new IllegalArgumentException("invalid type: " + type); + } + } catch (IOException e) { + throw new IllegalArgumentException(e); + } + } + + /** + * Converts the result of grouping pairs and the entities by source id to LinkedEntity + * @param p + * @return + */ + private LinkedEntity toLinkedEntity(Tuple2> p) { + final LinkedEntity e = new LinkedEntity(); + final List links = Lists.newArrayList(); + for(EntityRelEntity rel : p._2()) { + if (rel.hasMainEntity() & e.getEntity() == null) { + e.setEntity(rel.getSource()); + } + if (rel.hasRelatedEntity()) { + links.add(new Tuple() + .setRelation(rel.getRelation()) + .setTarget(rel.getTarget())); + } + } + e.setLinks(links); + if (e.getEntity() == null) { + throw new IllegalStateException("missing main entity on '" + p._1() + "'"); + } + return e; + } + + /** + * Converts a LinkedEntity to a JoinedEntity + * @param l + * @return + */ + private JoinedEntity toJoinedEntity(LinkedEntity l) { + return new JoinedEntity().setType(l.getEntity().getType()) + .setEntity(parseOaf(l.getEntity().getOaf(), l.getEntity().getType())) + .setLinks(l.getLinks() + .stream() + .map(t -> { + final ObjectMapper o = new ObjectMapper(); + try { + return new Tuple2<>( + o.readValue(t.getRelation().getOaf(), Relation.class), + o.readValue(t.getTarget().getOaf(), 
RelatedEntity.class)); + } catch (IOException e) { + throw new IllegalArgumentException(e); + } + }).collect(Collectors.toList())); } /** @@ -132,14 +257,14 @@ public class GraphJoiner implements Serializable { private JavaPairRDD readPathEntity(final JavaSparkContext sc, final String inputPath, final String type) { return sc.sequenceFile(inputPath + "/" + type, Text.class, Text.class) .mapToPair((PairFunction, String, TypedRow>) item -> { - - final String json = item._2().toString(); - final String id = JsonPath.read(json, "$.id"); + final String s = item._2().toString(); + final DocumentContext json = JsonPath.parse(s); + final String id = json.read("$.id"); return new Tuple2<>(id, new TypedRow() - .setSourceId(id) - .setDeleted(JsonPath.read(json, "$.dataInfo.deletedbyinference")) - .setType(type) - .setOaf(json)); + .setSourceId(id) + .setDeleted(json.read("$.dataInfo.deletedbyinference")) + .setType(type) + .setOaf(s)); }); } @@ -153,13 +278,14 @@ public class GraphJoiner implements Serializable { private JavaRDD readPathRelation(final JavaSparkContext sc, final String inputPath) { return sc.sequenceFile(inputPath + "/relation", Text.class, Text.class) .map(item -> { - final String json = item._2().toString(); + final String s = item._2().toString(); + final DocumentContext json = JsonPath.parse(s); return new TypedRow() - .setSourceId(JsonPath.read(json, "$.source")) - .setTargetId(JsonPath.read(json, "$.target")) - .setDeleted(JsonPath.read(json, "$.dataInfo.deletedbyinference")) + .setSourceId(json.read("$.source")) + .setTargetId(json.read("$.target")) + .setDeleted(json.read("$.dataInfo.deletedbyinference")) .setType("relation") - .setOaf(json); + .setOaf(s); }); } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphMappingUtils.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphMappingUtils.java index ab19ff2b5..e3622cd20 100644 --- 
a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphMappingUtils.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphMappingUtils.java @@ -1,9 +1,18 @@ package eu.dnetlib.dhp.graph; +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.Maps; +import com.jayway.jsonpath.DocumentContext; +import com.jayway.jsonpath.JsonPath; import eu.dnetlib.dhp.schema.oaf.*; +import net.minidev.json.JSONArray; +import org.apache.commons.lang3.StringUtils; +import java.util.LinkedHashMap; import java.util.Map; +import java.util.stream.Collectors; public class GraphMappingUtils { @@ -20,4 +29,132 @@ public class GraphMappingUtils { types.put("relation", Relation.class); } + public static EntityRelEntity pruneModel(EntityRelEntity e) { + + final DocumentContext j = JsonPath.parse(e.getSource().getOaf()); + final RelatedEntity re = new RelatedEntity().setId(j.read("$.id")).setType(e.getSource().getType()); + + switch (e.getSource().getType()) { + case "publication": + case "dataset": + case "otherresearchproduct": + case "software": + mapTitle(j, re); + re.setDateofacceptance(j.read("$.dateofacceptance.value")); + re.setPublisher(j.read("$.publisher.value")); + + JSONArray pids = j.read("$.pid"); + re.setPid(pids.stream() + .map(p -> asStructuredProperty((LinkedHashMap) p)) + .collect(Collectors.toList())); + + re.setResulttype(asQualifier(j.read("$.resulttype"))); + + JSONArray collfrom = j.read("$.collectedfrom"); + re.setCollectedfrom(collfrom.stream() + .map(c -> asKV((LinkedHashMap)c)) + .collect(Collectors.toList())); + + //TODO still to be mapped + //re.setCodeRepositoryUrl(j.read("$.coderepositoryurl")); + + break; + case "datasource": + re.setOfficialname(j.read("$.officialname.value")); + re.setWebsiteurl(j.read("$.websiteurl.value")); + 
re.setDatasourcetype(asQualifier(j.read("$.datasourcetype"))); + re.setOpenairecompatibility(asQualifier(j.read("$.openairecompatibility"))); + + break; + case "organization": + re.setLegalname(j.read("$.legalname.value")); + re.setLegalshortname(j.read("$.legalshortname.value")); + re.setCountry(asQualifier(j.read("$.country"))); + + break; + case "project": + re.setProjectTitle(j.read("$.title.value")); + re.setCode(j.read("$.code.value")); + re.setAcronym(j.read("$.acronym.value")); + re.setContracttype(asQualifier(j.read("$.contracttype"))); + + JSONArray f = j.read("$.fundingtree"); + if (!f.isEmpty()) { + re.setFundingtree(f.stream() + .map(s -> s.toString()) + .collect(Collectors.toList())); + } + + break; + } + return new EntityRelEntity().setSource( + new TypedRow() + .setSourceId(e.getSource().getSourceId()) + .setDeleted(e.getSource().getDeleted()) + .setType(e.getSource().getType()) + .setOaf(serialize(re))); + } + + private static KeyValue asKV(LinkedHashMap j) { + final KeyValue kv = new KeyValue(); + kv.setKey((String) j.get("key")); + kv.setValue((String) j.get("value")); + return kv; + } + + private static void mapTitle(DocumentContext j, RelatedEntity re) { + final JSONArray a = j.read("$.title"); + if (!a.isEmpty()) { + final StructuredProperty sp = asStructuredProperty((LinkedHashMap) a.get(0)); + if(StringUtils.isNotBlank(sp.getValue())) { + re.setTitle(sp); + } + } + } + + private static StructuredProperty asStructuredProperty(LinkedHashMap j) { + final StructuredProperty sp = new StructuredProperty(); + final String value = (String) j.get("value"); + if (StringUtils.isNotBlank(value)) { + sp.setValue((String) j.get("value")); + sp.setQualifier(asQualifier((LinkedHashMap) j.get("qualifier"))); + } + return sp; + } + + public static Qualifier asQualifier(LinkedHashMap j) { + final Qualifier q = new Qualifier(); + + final String classid = j.get("classid"); + if (StringUtils.isNotBlank(classid)) { + q.setClassid(classid); + } + + final String 
classname = j.get("classname"); + if (StringUtils.isNotBlank(classname)) { + q.setClassname(classname); + } + + final String schemeid = j.get("schemeid"); + if (StringUtils.isNotBlank(schemeid)) { + q.setSchemeid(schemeid); + } + + final String schemename = j.get("schemename"); + if (StringUtils.isNotBlank(schemename)) { + q.setSchemename(schemename); + } + return q; + } + + public static String serialize(final Object o) { + try { + return new ObjectMapper() + .setSerializationInclusion(JsonInclude.Include.NON_NULL) + .writeValueAsString(o); + } catch (JsonProcessingException e) { + throw new IllegalArgumentException("unable to serialize: " + o.toString(), e); + } + } + } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/JoinedEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/JoinedEntity.java new file mode 100644 index 000000000..d65eb64c8 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/JoinedEntity.java @@ -0,0 +1,44 @@ +package eu.dnetlib.dhp.graph; + +import eu.dnetlib.dhp.schema.oaf.OafEntity; +import eu.dnetlib.dhp.schema.oaf.Relation; +import scala.Tuple2; + +import java.io.Serializable; +import java.util.List; + +public class JoinedEntity implements Serializable { + + private String type; + + private OafEntity entity; + + private List> links; + + public String getType() { + return type; + } + + public JoinedEntity setType(String type) { + this.type = type; + return this; + } + + public OafEntity getEntity() { + return entity; + } + + public JoinedEntity setEntity(OafEntity entity) { + this.entity = entity; + return this; + } + + public List> getLinks() { + return links; + } + + public JoinedEntity setLinks(List> links) { + this.links = links; + return this; + } +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/MappingUtils.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/MappingUtils.java 
deleted file mode 100644 index 756506c12..000000000 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/MappingUtils.java +++ /dev/null @@ -1,103 +0,0 @@ -package eu.dnetlib.dhp.graph; - -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.jayway.jsonpath.DocumentContext; -import com.jayway.jsonpath.JsonPath; -import eu.dnetlib.dhp.schema.oaf.KeyValue; -import eu.dnetlib.dhp.schema.oaf.Qualifier; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -import net.minidev.json.JSONArray; - -import java.util.LinkedHashMap; -import java.util.stream.Collectors; - -public class MappingUtils { - - public EntityRelEntity pruneModel(EntityRelEntity e) throws JsonProcessingException { - - final DocumentContext j = JsonPath.parse(e.getSource().getOaf()); - final RelatedEntity re = new RelatedEntity(); - - switch (e.getSource().getType()) { - case "publication": - case "dataset": - case "otherresearchproduct": - case "software": - - mapTitle(j, re); - re.setDateofacceptance(j.read("$.dateofacceptance.value")); - re.setPublisher(j.read("$.publisher.value")); - - JSONArray pids = j.read("$.pid"); - re.setPid(pids.stream() - .map(p -> asStructuredProperty((LinkedHashMap) p)) - .collect(Collectors.toList())); - - re.setResulttype(asQualifier(j.read("$.resulttype"))); - - JSONArray collfrom = j.read("$.collectedfrom"); - re.setCollectedfrom(collfrom.stream() - .map(c -> asKV((LinkedHashMap)c)) - .collect(Collectors.toList())); - - //TODO still to be mapped - //re.setCodeRepositoryUrl(j.read("$.coderepositoryurl")); - - break; - case "datasource": - re.setOfficialname(j.read("$.officialname.value")); - re.setWebsiteurl(j.read("$.websiteurl.value")); - - re.setDatasourcetype(asQualifier(j.read("$.datasourcetype"))); - re.setOpenairecompatibility(asQualifier(j.read("$.openairecompatibility"))); - - break; - case "organization": - - break; - case "project": - mapTitle(j, re); - break; - } - - 
return new EntityRelEntity().setSource( - new TypedRow() - .setSourceId(e.getSource().getSourceId()) - .setDeleted(e.getSource().getDeleted()) - .setType(e.getSource().getType()) - .setOaf(new ObjectMapper().writeValueAsString(re))); - } - - private KeyValue asKV(LinkedHashMap j) { - final KeyValue kv = new KeyValue(); - kv.setKey((String) j.get("key")); - kv.setValue((String) j.get("value")); - return kv; - } - - private void mapTitle(DocumentContext j, RelatedEntity re) { - JSONArray a = j.read("$.title"); - if (!a.isEmpty()) { - re.setTitle(asStructuredProperty((LinkedHashMap) a.get(0))); - } - } - - private StructuredProperty asStructuredProperty(LinkedHashMap j) { - final StructuredProperty sp = new StructuredProperty(); - sp.setValue((String) j.get("value")); - sp.setQualifier(asQualifier((LinkedHashMap) j.get("qualifier"))); - return sp; - - } - - public Qualifier asQualifier(LinkedHashMap j) { - Qualifier q = new Qualifier(); - q.setClassid(j.get("classid")); - q.setClassname(j.get("classname")); - q.setSchemeid(j.get("schemeid")); - q.setSchemename(j.get("schemename")); - return q; - } - -} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/RelatedEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/RelatedEntity.java index a441392b2..50b97dace 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/RelatedEntity.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/RelatedEntity.java @@ -3,14 +3,22 @@ package eu.dnetlib.dhp.graph; import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import org.codehaus.jackson.map.ObjectMapper; +import java.io.IOException; import java.io.Serializable; import java.util.List; public class RelatedEntity implements Serializable { + private String id; + private String type; + + // common fields + private StructuredProperty title; 
+ private String websiteurl; // datasource, organizations, projects + // results - private StructuredProperty title; // also for projects private String dateofacceptance; private String publisher; private List pid; @@ -20,11 +28,10 @@ public class RelatedEntity implements Serializable { // datasource private String officialname; - private String websiteurl; // also for organizations, projects private Qualifier datasourcetype; private Qualifier datasourcetypeui; - //private String aggregatortype; private Qualifier openairecompatibility; + //private String aggregatortype; // organization private String legalname; @@ -32,10 +39,28 @@ public class RelatedEntity implements Serializable { private Qualifier country; // project + private String projectTitle; private String code; private String acronym; private Qualifier contracttype; - private String fundingtree; + private List fundingtree; + + public static RelatedEntity parse(final String json) { + try { + return new ObjectMapper().readValue(json, RelatedEntity.class); + } catch (IOException e) { + throw new IllegalArgumentException("invalid RelatedEntity, cannot parse: " + json); + } + } + + public String getId() { + return id; + } + + public RelatedEntity setId(String id) { + this.id = id; + return this; + } public StructuredProperty getTitle() { return title; @@ -199,12 +224,30 @@ public class RelatedEntity implements Serializable { return this; } - public String getFundingtree() { + public List getFundingtree() { return fundingtree; } - public RelatedEntity setFundingtree(String fundingtree) { + public RelatedEntity setFundingtree(List fundingtree) { this.fundingtree = fundingtree; return this; } -} + + public String getProjectTitle() { + return projectTitle; + } + + public RelatedEntity setProjectTitle(String projectTitle) { + this.projectTitle = projectTitle; + return this; + } + + public String getType() { + return type; + } + + public RelatedEntity setType(String type) { + this.type = type; + return this; + } +} 
\ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkGraphIndexingJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlRecordBuilderJob.java similarity index 54% rename from dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkGraphIndexingJob.java rename to dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlRecordBuilderJob.java index 1d55dda89..38bc2bae2 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkGraphIndexingJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlRecordBuilderJob.java @@ -7,37 +7,34 @@ import org.apache.hadoop.fs.Path; import org.apache.spark.SparkConf; import org.apache.spark.sql.SparkSession; -public class SparkGraphIndexingJob { - - private final static String OUTPUT_BASE_PATH = "/tmp/openaire_provision"; +public class SparkXmlRecordBuilderJob { public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkGraphIndexingJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_graph_parameters.json"))); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkXmlRecordBuilderJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_graph_parameters.json"))); parser.parseArgument(args); final SparkConf conf = new SparkConf() - .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") - .set("hive.metastore.uris", parser.get("hive_metastore_uris")); + .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); final SparkSession spark = SparkSession .builder() .config(conf) - .appName(SparkGraphIndexingJob.class.getSimpleName()) + .appName(SparkXmlRecordBuilderJob.class.getSimpleName()) .master(parser.get("master")) - .enableHiveSupport() .getOrCreate(); final String 
inputPath = parser.get("sourcePath"); - final String hiveDbName = parser.get("hive_db_name"); + final String outputPath = parser.get("outputPath"); final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration()); - if (fs.exists(new Path(OUTPUT_BASE_PATH))) { - fs.delete(new Path(OUTPUT_BASE_PATH), true); - fs.mkdirs(new Path(OUTPUT_BASE_PATH)); + if (fs.exists(new Path(outputPath))) { + fs.delete(new Path(outputPath), true); + fs.mkdirs(new Path(outputPath)); } - new GraphJoiner().join(spark, inputPath, hiveDbName, OUTPUT_BASE_PATH); + new GraphJoiner(spark, inputPath, outputPath) + .adjacencyLists(); } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_graph_parameters.json b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_graph_parameters.json index a197abc78..3a02ab1a0 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_graph_parameters.json +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_graph_parameters.json @@ -1,6 +1,6 @@ [ {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, {"paramName":"h", "paramLongName":"hive_metastore_uris","paramDescription": "the hive metastore uris", "paramRequired": true}, - {"paramName":"db", "paramLongName":"hive_db_name", "paramDescription": "the target hive database name", "paramRequired": true}, + {"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the path used to store temporary output files", "paramRequired": true}, {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true} ] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml 
b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml index a91759ade..4b4d2c7bf 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml @@ -26,20 +26,20 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + ${jobTracker} ${nameNode} yarn-cluster cluster - GraphIndexing - eu.dnetlib.dhp.graph.SparkGraphIndexingJob + build_adjacency_lists + eu.dnetlib.dhp.graph.SparkXmlRecordBuilderJob dhp-graph-provision-${projectVersion}.jar --executor-memory ${sparkExecutorMemory} @@ -53,8 +53,7 @@ -mt yarn-cluster --sourcePath${sourcePath} - --hive_db_name${hive_db_name} - --hive_metastore_uris${hive_metastore_uris} + --outputPath${outputPath} diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/MappingUtilsTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/MappingUtilsTest.java index 2edb0aa70..0deb3d81a 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/MappingUtilsTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/MappingUtilsTest.java @@ -9,11 +9,11 @@ import java.io.InputStreamReader; public class MappingUtilsTest { - private MappingUtils utils; + private GraphMappingUtils utils; @Before public void setUp() { - utils = new MappingUtils(); + utils = new GraphMappingUtils(); } @Test From b5e1e2e5b290efd9e3aaad12d6dfcd37cecce8ab Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 30 Jan 2020 18:11:04 +0100 Subject: [PATCH 19/45] reintegrated changes from fcbc4ccd70b7edfdb8c041dc9dbdaed5943fa13a --- .../eu/dnetlib/dhp/graph/GraphMapper.java | 77 ------------------- .../main/java/eu/dnetlib/dhp/graph/Link.java | 30 -------- .../eu/dnetlib/dhp/graph/LinkedEntity.java | 25 ++---- .../dhp/graph/LinkedEntityWrapper.java | 40 
---------- .../dhp/graph/SparkXmlRecordBuilderJob.java | 3 +- .../main/java/eu/dnetlib/dhp/graph/Tuple.java | 29 +++++++ 6 files changed, 37 insertions(+), 167 deletions(-) delete mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphMapper.java delete mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/Link.java delete mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/LinkedEntityWrapper.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/Tuple.java diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphMapper.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphMapper.java deleted file mode 100644 index bdfea7979..000000000 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphMapper.java +++ /dev/null @@ -1,77 +0,0 @@ -package eu.dnetlib.dhp.graph; - -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.schema.oaf.*; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SparkSession; - -import java.io.IOException; -import java.util.stream.Collectors; - -public class GraphMapper { - - - public void map(final SparkSession spark, final String outPath) { - - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - - sc.textFile(outPath + "/linked_entities") - .map(LinkedEntityWrapper::parse) - .map(GraphMapper::asLinkedEntity) - .map(e -> new ObjectMapper().writeValueAsString(e)) - .saveAsTextFile(outPath + "/linked_entities_types"); - } - - private static LinkedEntity asLinkedEntity(final LinkedEntityWrapper lw) throws JsonProcessingException { - final LinkedEntity le = new LinkedEntity(); - - try { - le.setType(lw.getEntity().getType()); - le.setEntity(parseEntity(lw.getEntity().getOaf(), le.getType())); - 
le.setLinks(lw.getLinks() - .stream() - .map(l -> new Link() - .setRelation(parseRelation(l.getRelation().getOaf())) - .setRelatedEntity(RelatedEntity.parse(l.getTarget().getOaf()))) - .collect(Collectors.toList())); - return le; - } catch (IllegalArgumentException e) { - throw new IllegalArgumentException(new ObjectMapper().writeValueAsString(lw), e); - } - } - - private static Relation parseRelation(final String s) { - try { - return new ObjectMapper().readValue(s, Relation.class); - } catch (IOException e) { - throw new IllegalArgumentException("unable to decode Relation: " + s); - } - } - - private static OafEntity parseEntity(final String json, final String type) { - final ObjectMapper o = new ObjectMapper(); - try { - switch (type) { - case "publication": - return o.readValue(json, Publication.class); - case "dataset": - return o.readValue(json, Dataset.class); - case "otherresearchproduct": - return o.readValue(json, OtherResearchProduct.class); - case "software": - return o.readValue(json, Software.class); - case "datasource": - return o.readValue(json, Datasource.class); - case "project": - return o.readValue(json, Project.class); - case "organization": - return o.readValue(json, Organization.class); - default: - throw new IllegalArgumentException("invalid entity type: " + type); - } - } catch (IOException e) { - throw new IllegalArgumentException("unable to decode oaf entity: " + json); - } - } -} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/Link.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/Link.java deleted file mode 100644 index 8426fbd12..000000000 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/Link.java +++ /dev/null @@ -1,30 +0,0 @@ -package eu.dnetlib.dhp.graph; - -import eu.dnetlib.dhp.schema.oaf.Relation; - -import java.io.Serializable; - -public class Link implements Serializable { - - private Relation relation; - - private RelatedEntity relatedEntity; 
- - public Relation getRelation() { - return relation; - } - - public Link setRelation(Relation relation) { - this.relation = relation; - return this; - } - - public RelatedEntity getRelatedEntity() { - return relatedEntity; - } - - public Link setRelatedEntity(RelatedEntity relatedEntity) { - this.relatedEntity = relatedEntity; - return this; - } -} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/LinkedEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/LinkedEntity.java index c7c2d1892..9e6fc0d38 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/LinkedEntity.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/LinkedEntity.java @@ -1,41 +1,28 @@ package eu.dnetlib.dhp.graph; -import eu.dnetlib.dhp.schema.oaf.OafEntity; - import java.io.Serializable; import java.util.List; public class LinkedEntity implements Serializable { - private String type; + private TypedRow entity; - private OafEntity entity; + private List links; - private List links; - - public String getType() { - return type; - } - - public LinkedEntity setType(String type) { - this.type = type; - return this; - } - - public OafEntity getEntity() { + public TypedRow getEntity() { return entity; } - public LinkedEntity setEntity(OafEntity entity) { + public LinkedEntity setEntity(TypedRow entity) { this.entity = entity; return this; } - public List getLinks() { + public List getLinks() { return links; } - public LinkedEntity setLinks(List links) { + public LinkedEntity setLinks(List links) { this.links = links; return this; } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/LinkedEntityWrapper.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/LinkedEntityWrapper.java deleted file mode 100644 index 17853208c..000000000 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/LinkedEntityWrapper.java +++ 
/dev/null @@ -1,40 +0,0 @@ -package eu.dnetlib.dhp.graph; - -import com.fasterxml.jackson.databind.ObjectMapper; - -import java.io.IOException; -import java.io.Serializable; -import java.util.List; - -public class LinkedEntityWrapper implements Serializable { - - private TypedRow entity; - - private List links; - - public static LinkedEntityWrapper parse(final String s) { - try { - return new ObjectMapper().readValue(s, LinkedEntityWrapper.class); - } catch (IOException e) { - throw new IllegalArgumentException("unable to decode LinkedEntityWrapper: " + s); - } - } - - public TypedRow getEntity() { - return entity; - } - - public LinkedEntityWrapper setEntity(TypedRow entity) { - this.entity = entity; - return this; - } - - public List getLinks() { - return links; - } - - public LinkedEntityWrapper setLinks(List links) { - this.links = links; - return this; - } -} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlRecordBuilderJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlRecordBuilderJob.java index 38bc2bae2..2a518eb92 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlRecordBuilderJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlRecordBuilderJob.java @@ -34,7 +34,8 @@ public class SparkXmlRecordBuilderJob { } new GraphJoiner(spark, inputPath, outputPath) - .adjacencyLists(); + .adjacencyLists() + .asXML(); } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/Tuple.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/Tuple.java new file mode 100644 index 000000000..1eb0491a7 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/Tuple.java @@ -0,0 +1,29 @@ +package eu.dnetlib.dhp.graph; + +import java.io.Serializable; + +public class Tuple implements Serializable { + + private TypedRow relation; + + private TypedRow 
target; + + + public TypedRow getRelation() { + return relation; + } + + public Tuple setRelation(TypedRow relation) { + this.relation = relation; + return this; + } + + public TypedRow getTarget() { + return target; + } + + public Tuple setTarget(TypedRow target) { + this.target = target; + return this; + } +} From 49ef2f4eb1d3d64c98b242b028e097e78044c1b6 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 30 Jan 2020 18:20:26 +0100 Subject: [PATCH 20/45] removed input parameter specification, SparkXmlRecordBuilderJob doesn't need hive --- .../resources/eu/dnetlib/dhp/graph/input_graph_parameters.json | 1 - 1 file changed, 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_graph_parameters.json b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_graph_parameters.json index 3a02ab1a0..cbd4285bf 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_graph_parameters.json +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_graph_parameters.json @@ -1,6 +1,5 @@ [ {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, - {"paramName":"h", "paramLongName":"hive_metastore_uris","paramDescription": "the hive metastore uris", "paramRequired": true}, {"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the path used to store temporary output files", "paramRequired": true}, {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true} ] \ No newline at end of file From 7ba0f44d0583d441c7d12bc30cabfc0f72a25d2c Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 30 Jan 2020 18:21:07 +0100 Subject: [PATCH 21/45] WIP --- .../java/eu/dnetlib/dhp/graph/SparkXmlRecordBuilderJob.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git 
a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlRecordBuilderJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlRecordBuilderJob.java index 2a518eb92..38bc2bae2 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlRecordBuilderJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlRecordBuilderJob.java @@ -34,8 +34,7 @@ public class SparkXmlRecordBuilderJob { } new GraphJoiner(spark, inputPath, outputPath) - .adjacencyLists() - .asXML(); + .adjacencyLists(); } } From ed290ca8d77668d3a9b215cd0ec26e58f2871ac1 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 3 Feb 2020 10:35:51 +0100 Subject: [PATCH 22/45] builder pattern --- .../eu/dnetlib/dhp/schema/oaf/Datasource.java | 4 +-- .../java/eu/dnetlib/dhp/schema/oaf/Field.java | 6 +++-- .../eu/dnetlib/dhp/schema/oaf/Instance.java | 26 ++++++++++++------- 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Datasource.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Datasource.java index f52a500fe..032468de2 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Datasource.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Datasource.java @@ -40,9 +40,9 @@ public class Datasource extends OafEntity implements Serializable { private List> odlanguages; - private List< Field> odcontenttypes; + private List> odcontenttypes; - private List< Field> accessinfopackage; + private List> accessinfopackage; // re3data fields private Field releasestartdate; diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Field.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Field.java index 2ab0b4d3c..a75ed25c2 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Field.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Field.java @@ -12,16 +12,18 @@ public 
class Field implements Serializable { return value; } - public void setValue(T value) { + public Field setValue(T value) { this.value = value; + return this; } public DataInfo getDataInfo() { return dataInfo; } - public void setDataInfo(DataInfo dataInfo) { + public Field setDataInfo(DataInfo dataInfo) { this.dataInfo = dataInfo; + return this; } @Override diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java index 8f852af65..06c907ee7 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java @@ -26,68 +26,74 @@ public class Instance implements Serializable { return license; } - public void setLicense(Field license) { + public Instance setLicense(Field license) { this.license = license; + return this; } public Qualifier getAccessright() { return accessright; } - public void setAccessright(Qualifier accessright) { + public Instance setAccessright(Qualifier accessright) { this.accessright = accessright; + return this; } public Qualifier getInstancetype() { return instancetype; } - public void setInstancetype(Qualifier instancetype) { + public Instance setInstancetype(Qualifier instancetype) { this.instancetype = instancetype; + return this; } public KeyValue getHostedby() { return hostedby; } - public void setHostedby(KeyValue hostedby) { + public Instance setHostedby(KeyValue hostedby) { this.hostedby = hostedby; + return this; } public List getUrl() { return url; } - public void setUrl(List url) { + public Instance setUrl(List url) { this.url = url; + return this; } public String getDistributionlocation() { return distributionlocation; } - public void setDistributionlocation(String distributionlocation) { + public Instance setDistributionlocation(String distributionlocation) { this.distributionlocation = distributionlocation; + return this; } public KeyValue getCollectedfrom() { 
return collectedfrom; } - public void setCollectedfrom(KeyValue collectedfrom) { + public Instance setCollectedfrom(KeyValue collectedfrom) { this.collectedfrom = collectedfrom; + return this; } public Field getDateofacceptance() { return dateofacceptance; } - public void setDateofacceptance(Field dateofacceptance) { + public Instance setDateofacceptance(Field dateofacceptance) { this.dateofacceptance = dateofacceptance; + return this; } - - public String toComparableString(){ return String.format("%s::%s::%s::%s", hostedby != null && hostedby.getKey()!= null ? hostedby.getKey().toLowerCase() : "", From d3b96f102b7ba953931c6681fa0b7fdbcc26c102 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 4 Feb 2020 14:10:58 +0100 Subject: [PATCH 23/45] builder pattern screws up the Parquet schema inference method, avoid using it in the bean definitions --- .../java/eu/dnetlib/dhp/schema/oaf/Field.java | 6 ++-- .../eu/dnetlib/dhp/schema/oaf/Instance.java | 24 +++++-------- .../eu/dnetlib/dhp/schema/oaf/Qualifier.java | 12 +++---- .../eu/dnetlib/dhp/schema/oaf/Result.java | 34 +++++++++---------- 4 files changed, 30 insertions(+), 46 deletions(-) diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Field.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Field.java index a75ed25c2..2ab0b4d3c 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Field.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Field.java @@ -12,18 +12,16 @@ public class Field implements Serializable { return value; } - public Field setValue(T value) { + public void setValue(T value) { this.value = value; - return this; } public DataInfo getDataInfo() { return dataInfo; } - public Field setDataInfo(DataInfo dataInfo) { + public void setDataInfo(DataInfo dataInfo) { this.dataInfo = dataInfo; - return this; } @Override diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java 
b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java index 06c907ee7..fe882cc1b 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java @@ -26,72 +26,64 @@ public class Instance implements Serializable { return license; } - public Instance setLicense(Field license) { + public void setLicense(Field license) { this.license = license; - return this; } public Qualifier getAccessright() { return accessright; } - public Instance setAccessright(Qualifier accessright) { + public void setAccessright(Qualifier accessright) { this.accessright = accessright; - return this; } public Qualifier getInstancetype() { return instancetype; } - public Instance setInstancetype(Qualifier instancetype) { + public void setInstancetype(Qualifier instancetype) { this.instancetype = instancetype; - return this; } public KeyValue getHostedby() { return hostedby; } - public Instance setHostedby(KeyValue hostedby) { + public void setHostedby(KeyValue hostedby) { this.hostedby = hostedby; - return this; } public List getUrl() { return url; } - public Instance setUrl(List url) { + public void setUrl(List url) { this.url = url; - return this; } public String getDistributionlocation() { return distributionlocation; } - public Instance setDistributionlocation(String distributionlocation) { + public void setDistributionlocation(String distributionlocation) { this.distributionlocation = distributionlocation; - return this; } public KeyValue getCollectedfrom() { return collectedfrom; } - public Instance setCollectedfrom(KeyValue collectedfrom) { + public void setCollectedfrom(KeyValue collectedfrom) { this.collectedfrom = collectedfrom; - return this; } public Field getDateofacceptance() { return dateofacceptance; } - public Instance setDateofacceptance(Field dateofacceptance) { + public void setDateofacceptance(Field dateofacceptance) { this.dateofacceptance = dateofacceptance; - return this; } 
public String toComparableString(){ diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java index 00ae88c52..ae2bf1a60 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java @@ -16,36 +16,32 @@ public class Qualifier implements Serializable { return classid; } - public Qualifier setClassid(String classid) { + public void setClassid(String classid) { this.classid = classid; - return this; } public String getClassname() { return classname; } - public Qualifier setClassname(String classname) { + public void setClassname(String classname) { this.classname = classname; - return this; } public String getSchemeid() { return schemeid; } - public Qualifier setSchemeid(String schemeid) { + public void setSchemeid(String schemeid) { this.schemeid = schemeid; - return this; } public String getSchemename() { return schemename; } - public Qualifier setSchemename(String schemename) { + public void setSchemename(String schemename) { this.schemename = schemename; - return this; } public String toComparableString() { diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java index eb5572ce1..4554d353c 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java @@ -210,6 +210,22 @@ public abstract class Result extends OafEntity implements Serializable { this.context = context; } + public Field getProcessingchargeamount() { + return processingchargeamount; + } + + public void setProcessingchargeamount(Field processingchargeamount) { + this.processingchargeamount = processingchargeamount; + } + + public Field getProcessingchargecurrency() { + return processingchargecurrency; + } + + public void setProcessingchargecurrency(Field 
processingchargecurrency) { + this.processingchargecurrency = processingchargecurrency; + } + public List getExternalReference() { return externalReference; } @@ -226,24 +242,6 @@ public abstract class Result extends OafEntity implements Serializable { this.instance = instance; } - public Field getProcessingchargeamount() { - return processingchargeamount; - } - - public Result setProcessingchargeamount(Field processingchargeamount) { - this.processingchargeamount = processingchargeamount; - return this; - } - - public Field getProcessingchargecurrency() { - return processingchargecurrency; - } - - public Result setProcessingchargecurrency(Field processingchargecurrency) { - this.processingchargecurrency = processingchargecurrency; - return this; - } - @Override public void mergeFrom(OafEntity e) { super.mergeFrom(e); From fbb0fc140b7f8b5e3d16c78d7df8fdbd92e8b5f3 Mon Sep 17 00:00:00 2001 From: Michele Artini Date: Tue, 4 Feb 2020 15:25:47 +0100 Subject: [PATCH 24/45] partial implementation of migration --- ...on.java => AbstractMigrationExecutor.java} | 4 +- .../dhp/migration/AbstractMongoExecutor.java | 369 ++++++++++++++++++ .../MigrateDbEntitiesApplication.java | 2 +- .../MigrateMongoMdstoresApplication.java | 339 +--------------- .../dnetlib/dhp/migration/MigrationUtils.java | 154 -------- .../dhp/migration/OafMigrationExecutor.java | 246 ++++++++++++ 6 files changed, 626 insertions(+), 488 deletions(-) rename dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/{AbstractMigrateApplication.java => AbstractMigrationExecutor.java} (97%) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java delete mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrationUtils.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OafMigrationExecutor.java diff --git 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrateApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrationExecutor.java similarity index 97% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrateApplication.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrationExecutor.java index 73ee7f822..389790511 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrateApplication.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrationExecutor.java @@ -30,7 +30,7 @@ import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.utils.DHPUtils; -public class AbstractMigrateApplication implements Closeable { +public class AbstractMigrationExecutor implements Closeable { private final AtomicInteger counter = new AtomicInteger(0); @@ -42,7 +42,7 @@ public class AbstractMigrateApplication implements Closeable { private final SequenceFile.Writer writer; - public AbstractMigrateApplication(final String hdfsPath, final String hdfsNameNode, final String hdfsUser) throws Exception { + public AbstractMigrationExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser) throws Exception { this.writer = SequenceFile.createWriter(getConf(hdfsNameNode, hdfsUser), SequenceFile.Writer.file(new Path(hdfsPath)), SequenceFile.Writer .keyClass(IntWritable.class), SequenceFile.Writer.valueClass(Text.class)); } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java new file mode 100644 index 000000000..51c39824a --- /dev/null +++ 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java @@ -0,0 +1,369 @@ +package eu.dnetlib.dhp.migration; + +import java.io.IOException; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + +import org.apache.commons.lang3.StringUtils; +import org.dom4j.Document; +import org.dom4j.DocumentException; +import org.dom4j.DocumentFactory; +import org.dom4j.DocumentHelper; +import org.dom4j.Node; + +import eu.dnetlib.dhp.schema.oaf.Author; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Field; +import eu.dnetlib.dhp.schema.oaf.GeoLocation; +import eu.dnetlib.dhp.schema.oaf.Instance; +import eu.dnetlib.dhp.schema.oaf.Journal; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.OAIProvenance; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.Software; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; + +public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { + + protected final Map code2name = new HashMap<>(); + + protected final MdstoreClient mdstoreClient; + + protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); + + protected static final Qualifier PUBLICATION_RESULTTYPE_QUALIFIER = + qualifier("publication", "publication", "dnet:result_typologies", "dnet:result_typologies"); + protected static final Qualifier DATASET_RESULTTYPE_QUALIFIER = qualifier("dataset", "dataset", "dnet:result_typologies", "dnet:result_typologies"); + protected static final 
Qualifier SOFTWARE_RESULTTYPE_QUALIFIER = qualifier("software", "software", "dnet:result_typologies", "dnet:result_typologies"); + protected static final Qualifier OTHER_RESULTTYPE_QUALIFIER = qualifier("other", "other", "dnet:result_typologies", "dnet:result_typologies"); + + public AbstractMongoExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl, + final String mongoDb, final String dbUrl, final String dbUser, + final String dbPassword) throws Exception { + + super(hdfsPath, hdfsNameNode, hdfsUser); + + this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb); + loadClassNames(dbUrl, dbUser, dbPassword); + + final Map nsContext = new HashMap<>(); + + registerNamespaces(nsContext); + nsContext.put("dc", "http://purl.org/dc/elements/1.1/"); + nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr"); + nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri"); + nsContext.put("oaf", "http://namespace.openaire.eu/oaf"); + nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/"); + nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance"); + DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); + } + + private void loadClassNames(final String dbUrl, final String dbUser, final String dbPassword) throws IOException { + try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) { + code2name.clear(); + dbClient.processResults("select code, name from class", rs -> { + try { + code2name.put(rs.getString("code"), rs.getString("name")); + } catch (final SQLException e) { + e.printStackTrace(); + } + }); + } + + } + + public void processMdRecords(final String mdFormat, final String mdLayout, final String mdInterpretation) throws DocumentException { + + for (final Entry entry : mdstoreClient.validCollections(mdFormat, mdLayout, mdInterpretation).entrySet()) { + // final String mdId = entry.getKey(); + final String currentColl = entry.getValue(); + + for (final 
String xml : mdstoreClient.listRecords(currentColl)) { + final Document doc = DocumentHelper.parseText(xml); + + final String type = doc.valueOf("//dr:CobjCategory/@type"); + final KeyValue collectedFrom = keyValue(doc.valueOf("//oaf:collectedFrom/@id"), doc.valueOf("//oaf:collectedFrom/@name")); + final DataInfo info = prepareDataInfo(doc); + final long lastUpdateTimestamp = new Date().getTime(); + + for (final Oaf oaf : createOafs(doc, type, collectedFrom, info, lastUpdateTimestamp)) { + emitOaf(oaf); + } + } + } + } + + protected abstract void registerNamespaces(Map nsContext); + + protected List createOafs(final Document doc, final String type, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) { + + final List oafs = new ArrayList<>(); + + switch (type.toLowerCase()) { + case "": + case "publication": + final Publication p = new Publication(); + populateResultFields(p, doc, collectedFrom, info, lastUpdateTimestamp); + p.setResulttype(PUBLICATION_RESULTTYPE_QUALIFIER); + p.setJournal(prepareJournal(doc, info)); + oafs.add(p); + break; + case "dataset": + final Dataset d = new Dataset(); + populateResultFields(d, doc, collectedFrom, info, lastUpdateTimestamp); + d.setResulttype(DATASET_RESULTTYPE_QUALIFIER); + d.setStoragedate(prepareDatasetStorageDate(doc, info)); + d.setDevice(prepareDatasetDevice(doc, info)); + d.setSize(prepareDatasetSize(doc, info)); + d.setVersion(prepareDatasetVersion(doc, info)); + d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info)); + d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info)); + d.setGeolocation(prepareDatasetGeoLocations(doc, info)); + oafs.add(d); + break; + case "software": + final Software s = new Software(); + populateResultFields(s, doc, collectedFrom, info, lastUpdateTimestamp); + s.setResulttype(SOFTWARE_RESULTTYPE_QUALIFIER); + s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info)); + s.setLicense(prepareSoftwareLicenses(doc, info)); + 
s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info)); + s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info)); + oafs.add(s); + break; + case "otherresearchproducts": + default: + final OtherResearchProduct o = new OtherResearchProduct(); + populateResultFields(o, doc, collectedFrom, info, lastUpdateTimestamp); + o.setResulttype(OTHER_RESULTTYPE_QUALIFIER); + o.setContactperson(prepareOtherResearchProductContactPersons(doc, info)); + o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info)); + o.setTool(prepareOtherResearchProductTools(doc, info)); + oafs.add(o); + break; + } + + if (!oafs.isEmpty()) { + addRelations(oafs, doc, "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO + addRelations(oafs, doc, "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO + addRelations(oafs, doc, "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO + } + + return oafs; + } + + private void populateResultFields(final Result r, final Document doc, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) { + r.setDataInfo(info); + r.setLastupdatetimestamp(lastUpdateTimestamp); + r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"))); + r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier"))); + r.setCollectedfrom(Arrays.asList(collectedFrom)); + r.setPid(prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); + r.setDateofcollection(doc.valueOf("//dr:dateOfCollection")); + r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation")); + r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r.setOaiprovenance(prepareOAIprovenance(doc)); + r.setAuthor(prepareAuthors(doc, info)); + r.setLanguage(prepareLanguages(doc)); + r.setCountry(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r.setSubject(prepareSubjects(doc, info)); + r.setTitle(prepareTitles(doc, info)); + 
r.setRelevantdate(prepareRelevantDates(doc, info)); + r.setDescription(prepareDescriptions(doc, info)); + r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info)); + r.setPublisher(preparePublisher(doc, info)); + r.setEmbargoenddate(prepareEmbargoEndDate(doc, info)); + r.setSource(prepareSources(doc, info)); + r.setFulltext(null); // NOT PRESENT IN MDSTORES + r.setFormat(prepareFormats(doc, info)); + r.setContributor(prepareContributors(doc, info)); + r.setResourcetype(null); // TODO + r.setCoverage(prepareCoverages(doc, info)); + r.setRefereed(null); // TODO + r.setContext(null); // TODO + r.setExternalReference(null); // TODO + r.setInstance(prepareInstances(doc, info)); + r.setProcessingchargeamount(null); // TODO + r.setProcessingchargecurrency(null); // TODO + } + + protected abstract List prepareInstances(Document doc, DataInfo info); + + protected abstract List> prepareSources(Document doc, DataInfo info); + + protected abstract Field prepareEmbargoEndDate(Document doc, DataInfo info); + + protected abstract List prepareRelevantDates(Document doc, DataInfo info); + + protected abstract List> prepareCoverages(Document doc, DataInfo info); + + protected abstract List> prepareContributors(Document doc, DataInfo info); + + protected abstract List> prepareFormats(Document doc, DataInfo info); + + protected abstract Field preparePublisher(Document doc, DataInfo info); + + protected abstract List> prepareDescriptions(Document doc, DataInfo info); + + protected abstract List prepareTitles(Document doc, DataInfo info); + + protected abstract List prepareSubjects(Document doc, DataInfo info); + + protected abstract Qualifier prepareLanguages(Document doc); + + protected abstract List prepareAuthors(Document doc, DataInfo info); + + protected abstract List> prepareOtherResearchProductTools(Document doc, DataInfo info); + + protected abstract List> prepareOtherResearchProductContactGroups(Document doc, DataInfo info); + + protected abstract List> 
prepareOtherResearchProductContactPersons(Document doc, DataInfo info); + + protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info); + + protected abstract Field prepareSoftwareCodeRepositoryUrl(Document doc, DataInfo info); + + protected abstract List prepareSoftwareLicenses(Document doc, DataInfo info); + + protected abstract List> prepareSoftwareDocumentationUrls(Document doc, DataInfo info); + + protected abstract List prepareDatasetGeoLocations(Document doc, DataInfo info); + + protected abstract Field prepareDatasetMetadataVersionNumber(Document doc, DataInfo info); + + protected abstract Field prepareDatasetLastMetadataUpdate(Document doc, DataInfo info); + + protected abstract Field prepareDatasetVersion(Document doc, DataInfo info); + + protected abstract Field prepareDatasetSize(Document doc, DataInfo info); + + protected abstract Field prepareDatasetDevice(Document doc, DataInfo info); + + protected abstract Field prepareDatasetStorageDate(Document doc, DataInfo info); + + abstract protected void addRelations(final List oafs, + final Document doc, + final String type, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp); + + private Journal prepareJournal(final Document doc, final DataInfo info) { + final Node n = doc.selectSingleNode("//oaf:journal"); + if (n != null) { + final String name = n.getText(); + final String issnPrinted = n.valueOf("@issn"); + final String issnOnline = n.valueOf("@eissn"); + final String issnLinking = n.valueOf("@lissn"); + if (StringUtils.isNotBlank(name)) { return journal(name, issnPrinted, issnOnline, issnLinking, null, null, null, null, null, null, null, info); } + } + return null; + } + + protected Qualifier prepareQualifier(final Document doc, final String xpath, final String schemeId, final String schemeName) { + final String classId = doc.valueOf(xpath); + final String className = code2name.get(classId); + return qualifier(classId, className, 
schemeId, schemeName); + } + + protected List prepareListStructProps(final Document doc, + final String xpath, + final String xpathClassId, + final String schemeId, + final String schemeName, + final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : doc.selectNodes(xpath)) { + final Node n = (Node) o; + final String classId = n.valueOf(xpathClassId); + final String className = code2name.get(classId); + res.add(structuredProperty(n.getText(), classId, className, schemeId, schemeName, info)); + } + return res; + } + + protected List prepareListStructProps(final Document doc, final String xpath, final Qualifier qualifier, final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : doc.selectNodes(xpath)) { + final Node n = (Node) o; + res.add(structuredProperty(n.getText(), qualifier, info)); + } + return res; + } + + protected List prepareListStructProps(final Document doc, final String xpath, final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : doc.selectNodes(xpath)) { + final Node n = (Node) o; + res.add(structuredProperty(n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), n + .valueOf("@schemename"), info)); + } + return res; + } + + protected OAIProvenance prepareOAIprovenance(final Document doc) { + final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']"); + + final String identifier = n.valueOf("./*[local-name()='identifier']"); + final String baseURL = n.valueOf("./*[local-name()='baseURL']");; + final String metadataNamespace = n.valueOf("./*[local-name()='metadataNamespace']");; + final boolean altered = n.valueOf("@altered").equalsIgnoreCase("true"); + final String datestamp = n.valueOf("./*[local-name()='datestamp']");; + final String harvestDate = n.valueOf("@harvestDate");; + + return oaiIProvenance(identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate); + } + + 
protected DataInfo prepareDataInfo(final Document doc) { + final Node n = doc.selectSingleNode("//oaf:datainfo"); + + final String paClassId = n.valueOf("./oaf:provenanceaction/@classid"); + final String paClassName = n.valueOf("./oaf:provenanceaction/@classname"); + final String paSchemeId = n.valueOf("./oaf:provenanceaction/@schemeid"); + final String paSchemeName = n.valueOf("./oaf:provenanceaction/@schemename"); + + final boolean deletedbyinference = Boolean.parseBoolean(n.valueOf("./oaf:deletedbyinference")); + final String inferenceprovenance = n.valueOf("./oaf:inferenceprovenance"); + final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred")); + final String trust = n.valueOf("./oaf:trust"); + + return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust); + } + + protected Field prepareField(final Document doc, final String xpath, final DataInfo info) { + return field(doc.valueOf(xpath), info); + } + + protected List> prepareListFields(final Document doc, final String xpath, final DataInfo info) { + return listFields(info, (String[]) prepareListString(doc, xpath).toArray()); + } + + protected List prepareListString(final Document doc, final String xpath) { + final List res = new ArrayList<>(); + for (final Object o : doc.selectNodes(xpath)) { + final String s = ((Node) o).getText().trim(); + if (StringUtils.isNotBlank(s)) { + res.add(s); + } + } + return res; + } + + @Override + public void close() throws IOException { + super.close(); + mdstoreClient.close(); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java index 0b47c5282..12043709f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java +++ 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java @@ -28,7 +28,7 @@ import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -public class MigrateDbEntitiesApplication extends AbstractMigrateApplication implements Closeable { +public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor implements Closeable { private static final Qualifier ENTITYREGISTRY_PROVENANCE_ACTION = qualifier("sysimport:crosswalk:entityregistry", "sysimport:crosswalk:entityregistry", "dnet:provenance_actions", "dnet:provenance_actions"); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateMongoMdstoresApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateMongoMdstoresApplication.java index f6dcaf0e8..124a4f3cc 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateMongoMdstoresApplication.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateMongoMdstoresApplication.java @@ -1,56 +1,10 @@ package eu.dnetlib.dhp.migration; -import java.io.Closeable; -import java.io.IOException; -import java.sql.SQLException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Date; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; - import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.dom4j.Document; -import org.dom4j.DocumentException; -import org.dom4j.DocumentFactory; -import org.dom4j.DocumentHelper; -import org.dom4j.Node; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.DataInfo; -import eu.dnetlib.dhp.schema.oaf.Dataset; -import eu.dnetlib.dhp.schema.oaf.Field; 
-import eu.dnetlib.dhp.schema.oaf.KeyValue; -import eu.dnetlib.dhp.schema.oaf.OAIProvenance; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; -import eu.dnetlib.dhp.schema.oaf.Publication; -import eu.dnetlib.dhp.schema.oaf.Qualifier; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.schema.oaf.Result; -import eu.dnetlib.dhp.schema.oaf.Software; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -public class MigrateMongoMdstoresApplication extends AbstractMigrateApplication implements Closeable { - - private static final Log log = LogFactory.getLog(MigrateMongoMdstoresApplication.class); - - private final Map code2name = new HashMap<>(); - - private final MdstoreClient mdstoreClient; - - private static final Qualifier MAIN_TITLE_QUALIFIER = qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); - - private static final Qualifier PUBLICATION_RESULTTYPE_QUALIFIER = - qualifier("publication", "publication", "dnet:result_typologies", "dnet:result_typologies"); - private static final Qualifier DATASET_RESULTTYPE_QUALIFIER = qualifier("dataset", "dataset", "dnet:result_typologies", "dnet:result_typologies"); - private static final Qualifier SOFTWARE_RESULTTYPE_QUALIFIER = qualifier("software", "software", "dnet:result_typologies", "dnet:result_typologies"); - private static final Qualifier OTHER_RESULTTYPE_QUALIFIER = qualifier("other", "other", "dnet:result_typologies", "dnet:result_typologies"); +public class MigrateMongoMdstoresApplication { public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( @@ -72,294 +26,17 @@ public class MigrateMongoMdstoresApplication extends AbstractMigrateApplication final String dbUser = parser.get("postgresUser"); final String dbPassword = parser.get("postgresPassword"); - try (final MigrateMongoMdstoresApplication mig = - new 
MigrateMongoMdstoresApplication(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword)) { - mig.processMdRecords(mdFormat, mdLayout, mdInterpretation); - } - - } - - public MigrateMongoMdstoresApplication(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl, - final String mongoDb, final String dbUrl, final String dbUser, - final String dbPassword) throws Exception { - super(hdfsPath, hdfsNameNode, hdfsUser); - - this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb); - loadClassNames(dbUrl, dbUser, dbPassword); - - final Map nsContext = new HashMap<>(); - nsContext.put("dc", "http://purl.org/dc/elements/1.1/"); - nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr"); - nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri"); - nsContext.put("oaf", "http://namespace.openaire.eu/oaf"); - nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/"); - nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance"); - DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); - } - - private void loadClassNames(final String dbUrl, final String dbUser, final String dbPassword) throws IOException { - try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) { - code2name.clear(); - dbClient.processResults("select code, name from class", rs -> { - try { - code2name.put(rs.getString("code"), rs.getString("name")); - } catch (final SQLException e) { - e.printStackTrace(); - } - }); - } - - } - - public void processMdRecords(final String mdFormat, final String mdLayout, final String mdInterpretation) throws DocumentException { - - for (final Entry entry : mdstoreClient.validCollections(mdFormat, mdLayout, mdInterpretation).entrySet()) { - // final String mdId = entry.getKey(); - final String currentColl = entry.getValue(); - - for (final String xml : mdstoreClient.listRecords(currentColl)) { - for (final Oaf oaf : createOafs(xml)) { - 
emitOaf(oaf); - } + if (mdFormat.equalsIgnoreCase("oaf")) { + try (final OafMigrationExecutor mig = + new OafMigrationExecutor(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword)) { + mig.processMdRecords(mdFormat, mdLayout, mdInterpretation); } - } - } + } else if (mdFormat.equalsIgnoreCase("oaf")) { - private List createOafs(final String xml) throws DocumentException { - - final Document doc = DocumentHelper.parseText(xml); - - final String type = doc.valueOf("//dr:CobjCategory/@type"); - final KeyValue collectedFrom = keyValue(doc.valueOf("//oaf:collectedFrom/@id"), doc.valueOf("//oaf:collectedFrom/@name")); - final DataInfo info = prepareDataInfo(doc); - final long lastUpdateTimestamp = new Date().getTime(); - - final List oafs = new ArrayList<>(); - - switch (type.toLowerCase()) { - case "": - case "publication": - final Publication p = new Publication(); - populateResultFields(p, doc, collectedFrom, info, lastUpdateTimestamp); - p.setResulttype(PUBLICATION_RESULTTYPE_QUALIFIER); - p.setJournal(null); // TODO - oafs.add(p); - break; - case "dataset": - final Dataset d = new Dataset(); - populateResultFields(d, doc, collectedFrom, info, lastUpdateTimestamp); - d.setResulttype(DATASET_RESULTTYPE_QUALIFIER); - d.setStoragedate(null); // TODO - d.setDevice(null); // TODO - d.setSize(null); // TODO - d.setVersion(null); // TODO - d.setLastmetadataupdate(null); // TODO - d.setMetadataversionnumber(null); // TODO - d.setGeolocation(null); // TODO - oafs.add(d); - break; - case "otherresearchproducts": - - case "software": - final Software s = new Software(); - populateResultFields(s, doc, collectedFrom, info, lastUpdateTimestamp); - s.setResulttype(SOFTWARE_RESULTTYPE_QUALIFIER); - s.setDocumentationUrl(null); // TODO - s.setLicense(null); // TODO - s.setCodeRepositoryUrl(null); // TODO - s.setProgrammingLanguage(null); // TODO - oafs.add(s); - break; - default: - final OtherResearchProduct o = new OtherResearchProduct(); - 
populateResultFields(o, doc, collectedFrom, info, lastUpdateTimestamp); - o.setResulttype(OTHER_RESULTTYPE_QUALIFIER); - o.setContactperson(null); // TODO - o.setContactgroup(null); // TODO - o.setTool(null); // TODO - oafs.add(o); - break; + } else { + throw new RuntimeException("Format not supported: " + mdFormat); } - if (!oafs.isEmpty()) { - addRelations(oafs, doc, "//*", "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO - addRelations(oafs, doc, "//*", "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO - addRelations(oafs, doc, "//*", "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO - } - - return oafs; - } - - private void addRelations(final List oafs, - final Document doc, - final String xpath, - final String type, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp) { - for (final Object o : doc.selectNodes(xpath)) { - final Node n = (Node) o; - final Relation r = new Relation(); - r.setRelType(null); // TODO - r.setSubRelType(null); // TODO - r.setRelClass(null); // TODO - r.setSource(null); // TODO - r.setTarget(null); // TODO - r.setCollectedFrom(Arrays.asList(collectedFrom)); - r.setDataInfo(info); - r.setLastupdatetimestamp(lastUpdateTimestamp); - oafs.add(r); - } - - } - - private void populateResultFields(final Result r, final Document doc, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) { - - r.setDataInfo(info); - r.setLastupdatetimestamp(lastUpdateTimestamp); - r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"))); - r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier"))); - r.setCollectedfrom(Arrays.asList(collectedFrom)); - r.setPid(prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); - r.setDateofcollection(doc.valueOf("//dr:dateOfCollection")); - r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation")); - r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN 
MDSTORES - r.setOaiprovenance(prepareOAIprovenance(doc)); - r.setAuthor(null); // TODO - r.setLanguage(prepareQualifier(doc, "//dc:language", "dnet:languages", "dnet:languages")); - r.setCountry(new ArrayList<>()); // NOT PRESENT IN MDSTORES - r.setSubject(prepareListStructProps(doc, "//dc:subject", info)); - r.setTitle(prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER, info)); - r.setRelevantdate(null); // TODO - r.setDescription(prepareListFields(doc, "//dc:description", info)); - r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info)); - r.setPublisher(prepareField(doc, "//dc:publisher", info)); - r.setEmbargoenddate(null); // TODO - r.setSource(null); // TODO - r.setFulltext(null); // TODO - r.setFormat(prepareListFields(doc, "//dc:format", info)); - r.setContributor(prepareListFields(doc, "//dc:contributor", info)); - r.setResourcetype(null); // TODO - r.setCoverage(prepareListFields(doc, "//dc:coverage", info)); - r.setRefereed(null); // TODO - r.setContext(null); // TODO - r.setExternalReference(null); // TODO - r.setInstance(null); // TODO - r.setProcessingchargeamount(null); // TODO - r.setProcessingchargecurrency(null); // TODO - } - - private Qualifier prepareQualifier(final Document doc, final String xpath, final String schemeId, final String schemeName) { - final String classId = doc.valueOf(xpath); - final String className = code2name.get(classId); - return qualifier(classId, className, schemeId, schemeName); - } - - private List prepareListStructProps(final Document doc, - final String xpath, - final String xpathClassId, - final String schemeId, - final String schemeName, - final DataInfo info) { - final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes(xpath)) { - final Node n = (Node) o; - final String classId = n.valueOf(xpathClassId); - final String className = code2name.get(classId); - res.add(structuredProperty(n.getText(), classId, className, schemeId, schemeName, info)); - } - return res; - } - - 
private List prepareListStructProps(final Document doc, final String xpath, final Qualifier qualifier, final DataInfo info) { - final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes(xpath)) { - final Node n = (Node) o; - res.add(structuredProperty(n.getText(), qualifier, info)); - } - return res; - } - - private List prepareListStructProps(final Document doc, final String xpath, final DataInfo info) { - final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes(xpath)) { - final Node n = (Node) o; - res.add(structuredProperty(n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), n - .valueOf("@schemename"), info)); - } - return res; - } - - private OAIProvenance prepareOAIprovenance(final Document doc) { - final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']"); - - final String identifier = n.valueOf("./*[local-name()='identifier']"); - final String baseURL = n.valueOf("./*[local-name()='baseURL']");; - final String metadataNamespace = n.valueOf("./*[local-name()='metadataNamespace']");; - final boolean altered = n.valueOf("@altered").equalsIgnoreCase("true"); - final String datestamp = n.valueOf("./*[local-name()='datestamp']");; - final String harvestDate = n.valueOf("@harvestDate");; - - return oaiIProvenance(identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate); - } - - private DataInfo prepareDataInfo(final Document doc) { - final Node n = doc.selectSingleNode("//oaf:datainfo"); - - final String paClassId = n.valueOf("./oaf:provenanceaction/@classid"); - final String paClassName = n.valueOf("./oaf:provenanceaction/@classname"); - final String paSchemeId = n.valueOf("./oaf:provenanceaction/@schemeid"); - final String paSchemeName = n.valueOf("./oaf:provenanceaction/@schemename"); - - final boolean deletedbyinference = Boolean.parseBoolean(n.valueOf("./oaf:deletedbyinference")); - final String inferenceprovenance = 
n.valueOf("./oaf:inferenceprovenance"); - final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred")); - final String trust = n.valueOf("./oaf:trust"); - - return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust); - } - - private Field prepareField(final Document doc, final String xpath, final DataInfo info) { - return field(doc.valueOf(xpath), info); - } - - private List> prepareListFields(final Document doc, final String xpath, final DataInfo info) { - return listFields(info, (String[]) prepareListString(doc, xpath).toArray()); - } - - private List prepareListString(final Document doc, final String xpath) { - final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes(xpath)) { - final String s = ((Node) o).getText().trim(); - if (StringUtils.isNotBlank(s)) { - res.add(s); - } - } - return res; - } - /* - * private StructuredProperty prepareStructProp(final Document doc, final String xpath, final DataInfo dataInfo) { if - * (StringUtils.isBlank(s)) { return null; } final String[] parts = s.split("###"); if (parts.length == 2) { final String value = - * parts[0]; final String[] arr = parts[1].split("@@@"); if (arr.length == 4) { return structuredProperty(value, arr[0], arr[1], arr[2], - * arr[3], dataInfo); } } return null; } - * - * private List prepareListOfStructProps(final Document doc, final String xpath, final DataInfo dataInfo) { final - * List res = new ArrayList<>(); if (array != null) { for (final String s : (String[]) array.getArray()) { final - * StructuredProperty sp = prepareStructProp(s, dataInfo); if (sp != null) { res.add(sp); } } } - * - * return res; } - * - * private Journal prepareJournal(final Document doc, final String xpath, final DataInfo info) { if (StringUtils.isNotBlank(sj)) { final - * String[] arr = sj.split("@@@"); if (arr.length == 3) { final String issn = StringUtils.isNotBlank(arr[0]) ? 
arr[0] : null; final - * String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1] : null;; final String lissn = StringUtils.isNotBlank(arr[2]) ? arr[2] : null;; - * if (issn != null || eissn != null || lissn != null) { return journal(name, issn, eissn, eissn, null, null, null, null, null, null, - * null, info); } } } return null; } - */ - - @Override - public void close() throws IOException { - super.close(); - mdstoreClient.close(); } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrationUtils.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrationUtils.java deleted file mode 100644 index c58688a79..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrationUtils.java +++ /dev/null @@ -1,154 +0,0 @@ -package eu.dnetlib.dhp.migration; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.stream.Collectors; - -import org.apache.commons.lang3.StringUtils; - -import eu.dnetlib.dhp.schema.oaf.DataInfo; -import eu.dnetlib.dhp.schema.oaf.ExtraInfo; -import eu.dnetlib.dhp.schema.oaf.Field; -import eu.dnetlib.dhp.schema.oaf.Journal; -import eu.dnetlib.dhp.schema.oaf.KeyValue; -import eu.dnetlib.dhp.schema.oaf.OAIProvenance; -import eu.dnetlib.dhp.schema.oaf.OriginDescription; -import eu.dnetlib.dhp.schema.oaf.Qualifier; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -import eu.dnetlib.dhp.utils.DHPUtils; - -public class MigrationUtils { - - public static KeyValue keyValue(final String k, final String v) { - final KeyValue kv = new KeyValue(); - kv.setKey(k); - kv.setValue(v); - return kv; - } - - public static List listKeyValues(final String... 
s) { - if (s.length % 2 > 0) { throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)"); } - - final List list = new ArrayList<>(); - for (int i = 0; i < s.length; i += 2) { - list.add(keyValue(s[i], s[i + 1])); - } - return list; - } - - public static Field field(final T value, final DataInfo info) { - final Field field = new Field<>(); - field.setValue(value); - field.setDataInfo(info); - return field; - } - - public static List> listFields(final DataInfo info, final String... values) { - return Arrays.stream(values).map(v -> field(v, info)).collect(Collectors.toList()); - } - - public static Qualifier qualifier(final String classid, final String classname, final String schemeid, final String schemename) { - final Qualifier q = new Qualifier(); - q.setClassid(classid); - q.setClassname(classname); - q.setSchemeid(schemeid); - q.setSchemename(schemename); - return q; - } - - public static StructuredProperty structuredProperty(final String value, - final String classid, - final String classname, - final String schemeid, - final String schemename, - final DataInfo dataInfo) { - final StructuredProperty sp = new StructuredProperty(); - sp.setValue(value); - sp.setQualifier(qualifier(classid, classname, schemeid, schemename)); - sp.setDataInfo(dataInfo); - return sp; - } - - public static ExtraInfo extraInfo(final String name, final String value, final String typology, final String provenance, final String trust) { - final ExtraInfo info = new ExtraInfo(); - info.setName(name); - info.setValue(value); - info.setTypology(typology); - info.setProvenance(provenance); - info.setTrust(trust); - return info; - } - - public static OAIProvenance oaiIProvenance(final String identifier, - final String baseURL, - final String metadataNamespace, - final Boolean altered, - final String datestamp, - final String harvestDate) { - - final OriginDescription desc = new OriginDescription(); - desc.setIdentifier(identifier); - desc.setBaseURL(baseURL); - 
desc.setMetadataNamespace(metadataNamespace); - desc.setAltered(altered); - desc.setDatestamp(datestamp); - desc.setHarvestDate(harvestDate); - - final OAIProvenance p = new OAIProvenance(); - p.setOriginDescription(desc); - - return p; - } - - public static Journal journal(final String name, - final String issnPrinted, - final String issnOnline, - final String issnLinking, - final String ep, - final String iss, - final String sp, - final String vol, - final String edition, - final String conferenceplace, - final String conferencedate, - final DataInfo dataInfo) { - final Journal j = new Journal(); - j.setName(name); - j.setIssnPrinted(issnPrinted); - j.setIssnOnline(issnOnline); - j.setIssnLinking(issnLinking); - j.setEp(ep); - j.setIss(iss); - j.setSp(sp); - j.setVol(vol); - j.setEdition(edition); - j.setConferenceplace(conferenceplace); - j.setConferencedate(conferencedate); - j.setDataInfo(dataInfo); - return j; - } - - public static DataInfo dataInfo(final Boolean deletedbyinference, - final String inferenceprovenance, - final Boolean inferred, - final Boolean invisible, - final Qualifier provenanceaction, - final String trust) { - final DataInfo d = new DataInfo(); - d.setDeletedbyinference(deletedbyinference); - d.setInferenceprovenance(inferenceprovenance); - d.setInferred(inferred); - d.setInvisible(invisible); - d.setProvenanceaction(provenanceaction); - d.setTrust(trust); - return d; - } - - public static String createOpenaireId(final String prefix, final String originalId) { - final String nsPrefix = StringUtils.substringBefore(originalId, "::"); - final String rest = StringUtils.substringAfter(originalId, "::"); - return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest)); - } - -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OafMigrationExecutor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OafMigrationExecutor.java new file mode 100644 index 000000000..4d222f360 --- 
/dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OafMigrationExecutor.java @@ -0,0 +1,246 @@ +package eu.dnetlib.dhp.migration; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.dom4j.Document; +import org.dom4j.Node; + +import eu.dnetlib.dhp.schema.oaf.Author; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.Field; +import eu.dnetlib.dhp.schema.oaf.GeoLocation; +import eu.dnetlib.dhp.schema.oaf.Instance; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; + +public class OafMigrationExecutor extends AbstractMongoExecutor { + + public OafMigrationExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl, final String mongoDb, + final String dbUrl, final String dbUser, + final String dbPassword) throws Exception { + super(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword); + } + + private static final Log log = LogFactory.getLog(MigrateMongoMdstoresApplication.class); + + @Override + protected void registerNamespaces(final Map nsContext) { + nsContext.put("dc", "http://purl.org/dc/elements/1.1/"); + nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr"); + nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri"); + nsContext.put("oaf", "http://namespace.openaire.eu/oaf"); + nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/"); + nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance"); + } + + @Override + protected void addRelations(final List oafs, + final Document doc, + final String type, + final KeyValue collectedFrom, + final DataInfo info, + 
final long lastUpdateTimestamp) { + for (final Object o : doc.selectNodes("//")) { // TODO + final Node n = (Node) o; + final Relation r = new Relation(); + r.setRelType(null); // TODO + r.setSubRelType(null); // TODO + r.setRelClass(null); // TODO + r.setSource(null); // TODO + r.setTarget(null); // TODO + r.setCollectedFrom(Arrays.asList(collectedFrom)); + r.setDataInfo(info); + r.setLastupdatetimestamp(lastUpdateTimestamp); + oafs.add(r); + } + + } + + @Override + protected List prepareAuthors(final Document doc, final DataInfo info) { + final List res = new ArrayList<>(); + int pos = 1; + for (final Object o : doc.selectNodes("//dc:creator")) { + final Node n = (Node) o; + final Author author = new Author(); + author.setFullname(n.getText()); + author.setRank(pos++); + } + return res; + } + + @Override + protected Qualifier prepareLanguages(final Document doc) { + return prepareQualifier(doc, "//dc:language", "dnet:languages", "dnet:languages"); + } + + @Override + protected List prepareSubjects(final Document doc, final DataInfo info) { + return prepareListStructProps(doc, "//dc:subject", info); + } + + @Override + protected List prepareTitles(final Document doc, final DataInfo info) { + return prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER, info); + } + + @Override + protected List> prepareDescriptions(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:description", info); + } + + @Override + protected Field preparePublisher(final Document doc, final DataInfo info) { + return prepareField(doc, "//dc:publisher", info); + } + + @Override + protected List> prepareFormats(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:format", info); + } + + @Override + protected List> prepareContributors(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:contributor", info); + } + + @Override + protected List> prepareCoverages(final Document doc, final DataInfo info) { + 
return prepareListFields(doc, "//dc:coverage", info); + } + + @Override + protected List prepareInstances(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected List> prepareSources(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:source", info); + } + + @Override + protected Field prepareEmbargoEndDate(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected List prepareRelevantDates(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected List> prepareOtherResearchProductTools(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected List> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected List> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected Field prepareSoftwareCodeRepositoryUrl(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected List prepareSoftwareLicenses(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected List> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected List prepareDatasetGeoLocations(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + 
protected Field prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected Field prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected Field prepareDatasetVersion(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected Field prepareDatasetSize(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected Field prepareDatasetDevice(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected Field prepareDatasetStorageDate(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + /* + * private StructuredProperty prepareStructProp(final Document doc, final String xpath, final DataInfo dataInfo) { if + * (StringUtils.isBlank(s)) { return null; } final String[] parts = s.split("###"); if (parts.length == 2) { final String value = + * parts[0]; final String[] arr = parts[1].split("@@@"); if (arr.length == 4) { return structuredProperty(value, arr[0], arr[1], arr[2], + * arr[3], dataInfo); } } return null; } + * + * private List prepareListOfStructProps(final Document doc, final String xpath, final DataInfo dataInfo) { final + * List res = new ArrayList<>(); if (array != null) { for (final String s : (String[]) array.getArray()) { final + * StructuredProperty sp = prepareStructProp(s, dataInfo); if (sp != null) { res.add(sp); } } } + * + * return res; } + * + * private Journal prepareJournal(final Document doc, final String xpath, final DataInfo info) { if (StringUtils.isNotBlank(sj)) { final + * String[] arr = sj.split("@@@"); if (arr.length == 3) { final String issn = StringUtils.isNotBlank(arr[0]) ? 
arr[0] : null; final + * String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1] : null;; final String lissn = StringUtils.isNotBlank(arr[2]) ? arr[2] : null;; + * if (issn != null || eissn != null || lissn != null) { return journal(name, issn, eissn, eissn, null, null, null, null, null, null, + * null, info); } } } return null; } + */ + +} From bb1533a07e0dbaac9e71bbcf29f523e69e9dcdc2 Mon Sep 17 00:00:00 2001 From: Michele Artini Date: Wed, 5 Feb 2020 15:35:40 +0100 Subject: [PATCH 25/45] partial commit --- .../dhp/migration/AbstractMongoExecutor.java | 38 +++- .../dhp/migration/OafMigrationExecutor.java | 82 +++---- .../dhp/migration/OdfMigrationExecutor.java | 209 ++++++++++++++++++ 3 files changed, 279 insertions(+), 50 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OdfMigrationExecutor.java diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java index 51c39824a..cf1581b4d 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java @@ -94,10 +94,13 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { final String type = doc.valueOf("//dr:CobjCategory/@type"); final KeyValue collectedFrom = keyValue(doc.valueOf("//oaf:collectedFrom/@id"), doc.valueOf("//oaf:collectedFrom/@name")); + final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) ? 
collectedFrom + : keyValue(doc.valueOf("//oaf:hostedBy/@id"), doc.valueOf("//oaf:hostedBy/@name")); + final DataInfo info = prepareDataInfo(doc); final long lastUpdateTimestamp = new Date().getTime(); - for (final Oaf oaf : createOafs(doc, type, collectedFrom, info, lastUpdateTimestamp)) { + for (final Oaf oaf : createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp)) { emitOaf(oaf); } } @@ -106,7 +109,12 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { protected abstract void registerNamespaces(Map nsContext); - protected List createOafs(final Document doc, final String type, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) { + protected List createOafs(final Document doc, + final String type, + final KeyValue collectedFrom, + final KeyValue hostedBy, + final DataInfo info, + final long lastUpdateTimestamp) { final List oafs = new ArrayList<>(); @@ -114,14 +122,14 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { case "": case "publication": final Publication p = new Publication(); - populateResultFields(p, doc, collectedFrom, info, lastUpdateTimestamp); + populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); p.setResulttype(PUBLICATION_RESULTTYPE_QUALIFIER); p.setJournal(prepareJournal(doc, info)); oafs.add(p); break; case "dataset": final Dataset d = new Dataset(); - populateResultFields(d, doc, collectedFrom, info, lastUpdateTimestamp); + populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); d.setResulttype(DATASET_RESULTTYPE_QUALIFIER); d.setStoragedate(prepareDatasetStorageDate(doc, info)); d.setDevice(prepareDatasetDevice(doc, info)); @@ -134,7 +142,7 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { break; case "software": final Software s = new Software(); - populateResultFields(s, doc, collectedFrom, info, lastUpdateTimestamp); + 
populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); s.setResulttype(SOFTWARE_RESULTTYPE_QUALIFIER); s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info)); s.setLicense(prepareSoftwareLicenses(doc, info)); @@ -145,7 +153,7 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { case "otherresearchproducts": default: final OtherResearchProduct o = new OtherResearchProduct(); - populateResultFields(o, doc, collectedFrom, info, lastUpdateTimestamp); + populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); o.setResulttype(OTHER_RESULTTYPE_QUALIFIER); o.setContactperson(prepareOtherResearchProductContactPersons(doc, info)); o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info)); @@ -163,7 +171,12 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { return oafs; } - private void populateResultFields(final Result r, final Document doc, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) { + private void populateResultFields(final Result r, + final Document doc, + final KeyValue collectedFrom, + final KeyValue hostedBy, + final DataInfo info, + final long lastUpdateTimestamp) { r.setDataInfo(info); r.setLastupdatetimestamp(lastUpdateTimestamp); r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"))); @@ -193,12 +206,12 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { r.setRefereed(null); // TODO r.setContext(null); // TODO r.setExternalReference(null); // TODO - r.setInstance(prepareInstances(doc, info)); + r.setInstance(prepareInstances(doc, info, collectedFrom, hostedBy)); r.setProcessingchargeamount(null); // TODO r.setProcessingchargecurrency(null); // TODO } - protected abstract List prepareInstances(Document doc, DataInfo info); + protected abstract List prepareInstances(Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby); protected 
abstract List> prepareSources(Document doc, DataInfo info); @@ -266,7 +279,12 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { final String issnPrinted = n.valueOf("@issn"); final String issnOnline = n.valueOf("@eissn"); final String issnLinking = n.valueOf("@lissn"); - if (StringUtils.isNotBlank(name)) { return journal(name, issnPrinted, issnOnline, issnLinking, null, null, null, null, null, null, null, info); } + final String ep = n.valueOf("@ep"); + final String iss = n.valueOf("@iss"); + final String sp = n.valueOf("@sp"); + final String vol = n.valueOf("@vol"); + final String edition = n.valueOf("@edition"); + if (StringUtils.isNotBlank(name)) { return journal(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, null, null, info); } } return null; } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OafMigrationExecutor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OafMigrationExecutor.java index 4d222f360..f46b31732 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OafMigrationExecutor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OafMigrationExecutor.java @@ -23,14 +23,14 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty; public class OafMigrationExecutor extends AbstractMongoExecutor { + private static final Log log = LogFactory.getLog(OafMigrationExecutor.class); + public OafMigrationExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl, final String mongoDb, final String dbUrl, final String dbUser, final String dbPassword) throws Exception { super(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword); } - private static final Log log = LogFactory.getLog(MigrateMongoMdstoresApplication.class); - @Override protected void registerNamespaces(final Map nsContext) { nsContext.put("dc", 
"http://purl.org/dc/elements/1.1/"); @@ -73,6 +73,7 @@ public class OafMigrationExecutor extends AbstractMongoExecutor { final Author author = new Author(); author.setFullname(n.getText()); author.setRank(pos++); + res.add(author); } return res; } @@ -118,9 +119,24 @@ public class OafMigrationExecutor extends AbstractMongoExecutor { } @Override - protected List prepareInstances(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + protected List prepareInstances(final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) { + final List res = new ArrayList<>(); + for (final Object o : doc.selectNodes("//dc:identifier")) { + final String url = ((Node) o).getText().trim(); + if (url.startsWith("http")) { + final Instance instance = new Instance(); + instance.setUrl(url); + instance.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource")); + instance.setCollectedfrom(collectedfrom); + instance.setHostedby(hostedby); + instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); + instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); + instance.setAccessright(prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes")); + instance.setLicense(field(doc.valueOf("//oaf:license"), info)); + res.add(instance); + } + } + return res; } @Override @@ -140,23 +156,7 @@ public class OafMigrationExecutor extends AbstractMongoExecutor { return null; } - @Override - protected List> prepareOtherResearchProductTools(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; - } - - @Override - protected List> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; - } - - @Override - protected List> prepareOtherResearchProductContactPersons(final Document doc, final 
DataInfo info) { - // TODO Auto-generated method stub - return null; - } + // SOFTWARES @Override protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) { @@ -182,6 +182,7 @@ public class OafMigrationExecutor extends AbstractMongoExecutor { return null; } + // DATASETS @Override protected List prepareDatasetGeoLocations(final Document doc, final DataInfo info) { // TODO Auto-generated method stub @@ -224,23 +225,24 @@ public class OafMigrationExecutor extends AbstractMongoExecutor { return null; } - /* - * private StructuredProperty prepareStructProp(final Document doc, final String xpath, final DataInfo dataInfo) { if - * (StringUtils.isBlank(s)) { return null; } final String[] parts = s.split("###"); if (parts.length == 2) { final String value = - * parts[0]; final String[] arr = parts[1].split("@@@"); if (arr.length == 4) { return structuredProperty(value, arr[0], arr[1], arr[2], - * arr[3], dataInfo); } } return null; } - * - * private List prepareListOfStructProps(final Document doc, final String xpath, final DataInfo dataInfo) { final - * List res = new ArrayList<>(); if (array != null) { for (final String s : (String[]) array.getArray()) { final - * StructuredProperty sp = prepareStructProp(s, dataInfo); if (sp != null) { res.add(sp); } } } - * - * return res; } - * - * private Journal prepareJournal(final Document doc, final String xpath, final DataInfo info) { if (StringUtils.isNotBlank(sj)) { final - * String[] arr = sj.split("@@@"); if (arr.length == 3) { final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0] : null; final - * String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1] : null;; final String lissn = StringUtils.isNotBlank(arr[2]) ? 
arr[2] : null;; - * if (issn != null || eissn != null || lissn != null) { return journal(name, issn, eissn, eissn, null, null, null, null, null, null, - * null, info); } } } return null; } - */ + // OTHER PRODUCTS + + @Override + protected List> prepareOtherResearchProductTools(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected List> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected List> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OdfMigrationExecutor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OdfMigrationExecutor.java new file mode 100644 index 000000000..bb0932883 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OdfMigrationExecutor.java @@ -0,0 +1,209 @@ +package eu.dnetlib.dhp.migration; + +import java.util.List; +import java.util.Map; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.dom4j.Document; + +import eu.dnetlib.dhp.schema.oaf.Author; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.Field; +import eu.dnetlib.dhp.schema.oaf.GeoLocation; +import eu.dnetlib.dhp.schema.oaf.Instance; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; + +public class OdfMigrationExecutor extends AbstractMongoExecutor { + + private static final Log log = LogFactory.getLog(OdfMigrationExecutor.class); + + public OdfMigrationExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl, 
final String mongoDb, + final String dbUrl, final String dbUser, + final String dbPassword) throws Exception { + super(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword); + } + + @Override + protected void registerNamespaces(final Map nsContext) { + // TODO Auto-generated method stub + + } + + @Override + protected List prepareInstances(final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected List> prepareSources(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected Field prepareEmbargoEndDate(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected List prepareRelevantDates(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected List> prepareCoverages(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected List> prepareContributors(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected List> prepareFormats(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected Field preparePublisher(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected List> prepareDescriptions(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected List prepareTitles(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected List prepareSubjects(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return 
null; + } + + @Override + protected Qualifier prepareLanguages(final Document doc) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected List prepareAuthors(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected List> prepareOtherResearchProductTools(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected List> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected List> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected Field prepareSoftwareCodeRepositoryUrl(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected List prepareSoftwareLicenses(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected List> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected List prepareDatasetGeoLocations(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected Field prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected Field prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected Field prepareDatasetVersion(final 
Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected Field prepareDatasetSize(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected Field prepareDatasetDevice(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected Field prepareDatasetStorageDate(final Document doc, final DataInfo info) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected void addRelations(final List oafs, + final Document doc, + final String type, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp) { + // TODO Auto-generated method stub + + } + +} From 181e8498d4bfa1f848048715c2231bc3b6613907 Mon Sep 17 00:00:00 2001 From: Michele Artini Date: Fri, 7 Feb 2020 16:02:49 +0100 Subject: [PATCH 26/45] ... --- .../dhp/migration/AbstractMongoExecutor.java | 47 ++-- .../MigrateMongoMdstoresApplication.java | 7 +- .../dhp/migration/OafMigrationExecutor.java | 12 +- .../dhp/migration/OdfMigrationExecutor.java | 220 ++++++++++-------- 4 files changed, 151 insertions(+), 135 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java index cf1581b4d..1fa70dded 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java @@ -60,12 +60,7 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { final Map nsContext = new HashMap<>(); registerNamespaces(nsContext); - nsContext.put("dc", "http://purl.org/dc/elements/1.1/"); - nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr"); - nsContext.put("dri", 
"http://www.driver-repository.eu/namespace/dri"); - nsContext.put("oaf", "http://namespace.openaire.eu/oaf"); - nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/"); - nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance"); + DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); } @@ -107,7 +102,13 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { } } - protected abstract void registerNamespaces(Map nsContext); + protected void registerNamespaces(final Map nsContext) { + nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr"); + nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri"); + nsContext.put("oaf", "http://namespace.openaire.eu/oaf"); + nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/"); + nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance"); + } protected List createOafs(final Document doc, final String type, @@ -196,7 +197,7 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { r.setDescription(prepareDescriptions(doc, info)); r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info)); r.setPublisher(preparePublisher(doc, info)); - r.setEmbargoenddate(prepareEmbargoEndDate(doc, info)); + r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info)); r.setSource(prepareSources(doc, info)); r.setFulltext(null); // NOT PRESENT IN MDSTORES r.setFormat(prepareFormats(doc, info)); @@ -215,8 +216,6 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { protected abstract List> prepareSources(Document doc, DataInfo info); - protected abstract Field prepareEmbargoEndDate(Document doc, DataInfo info); - protected abstract List prepareRelevantDates(Document doc, DataInfo info); protected abstract List> prepareCoverages(Document doc, DataInfo info); @@ -289,20 +288,20 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { return null; } - 
protected Qualifier prepareQualifier(final Document doc, final String xpath, final String schemeId, final String schemeName) { - final String classId = doc.valueOf(xpath); + protected Qualifier prepareQualifier(final Node node, final String xpath, final String schemeId, final String schemeName) { + final String classId = node.valueOf(xpath); final String className = code2name.get(classId); return qualifier(classId, className, schemeId, schemeName); } - protected List prepareListStructProps(final Document doc, + protected List prepareListStructProps(final Node node, final String xpath, final String xpathClassId, final String schemeId, final String schemeName, final DataInfo info) { final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes(xpath)) { + for (final Object o : node.selectNodes(xpath)) { final Node n = (Node) o; final String classId = n.valueOf(xpathClassId); final String className = code2name.get(classId); @@ -311,18 +310,18 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { return res; } - protected List prepareListStructProps(final Document doc, final String xpath, final Qualifier qualifier, final DataInfo info) { + protected List prepareListStructProps(final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) { final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes(xpath)) { + for (final Object o : node.selectNodes(xpath)) { final Node n = (Node) o; res.add(structuredProperty(n.getText(), qualifier, info)); } return res; } - protected List prepareListStructProps(final Document doc, final String xpath, final DataInfo info) { + protected List prepareListStructProps(final Node node, final String xpath, final DataInfo info) { final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes(xpath)) { + for (final Object o : node.selectNodes(xpath)) { final Node n = (Node) o; res.add(structuredProperty(n.getText(), n.valueOf("@classid"), 
n.valueOf("@classname"), n.valueOf("@schemeid"), n .valueOf("@schemename"), info)); @@ -359,17 +358,17 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust); } - protected Field prepareField(final Document doc, final String xpath, final DataInfo info) { - return field(doc.valueOf(xpath), info); + protected Field prepareField(final Node node, final String xpath, final DataInfo info) { + return field(node.valueOf(xpath), info); } - protected List> prepareListFields(final Document doc, final String xpath, final DataInfo info) { - return listFields(info, (String[]) prepareListString(doc, xpath).toArray()); + protected List> prepareListFields(final Node node, final String xpath, final DataInfo info) { + return listFields(info, (String[]) prepareListString(node, xpath).toArray()); } - protected List prepareListString(final Document doc, final String xpath) { + protected List prepareListString(final Node node, final String xpath) { final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes(xpath)) { + for (final Object o : node.selectNodes(xpath)) { final String s = ((Node) o).getText().trim(); if (StringUtils.isNotBlank(s)) { res.add(s); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateMongoMdstoresApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateMongoMdstoresApplication.java index 124a4f3cc..359fe7596 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateMongoMdstoresApplication.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateMongoMdstoresApplication.java @@ -31,8 +31,11 @@ public class MigrateMongoMdstoresApplication { new OafMigrationExecutor(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword)) 
{ mig.processMdRecords(mdFormat, mdLayout, mdInterpretation); } - } else if (mdFormat.equalsIgnoreCase("oaf")) { - + } else if (mdFormat.equalsIgnoreCase("odf")) { + try (final OdfMigrationExecutor mig = + new OdfMigrationExecutor(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword)) { + mig.processMdRecords(mdFormat, mdLayout, mdInterpretation); + } } else { throw new RuntimeException("Format not supported: " + mdFormat); } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OafMigrationExecutor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OafMigrationExecutor.java index f46b31732..6dcfae71f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OafMigrationExecutor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OafMigrationExecutor.java @@ -33,12 +33,8 @@ public class OafMigrationExecutor extends AbstractMongoExecutor { @Override protected void registerNamespaces(final Map nsContext) { + super.registerNamespaces(nsContext); nsContext.put("dc", "http://purl.org/dc/elements/1.1/"); - nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr"); - nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri"); - nsContext.put("oaf", "http://namespace.openaire.eu/oaf"); - nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/"); - nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance"); } @Override @@ -144,12 +140,6 @@ public class OafMigrationExecutor extends AbstractMongoExecutor { return prepareListFields(doc, "//dc:source", info); } - @Override - protected Field prepareEmbargoEndDate(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; - } - @Override protected List prepareRelevantDates(final Document doc, final DataInfo info) { // TODO Auto-generated method stub diff --git 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OdfMigrationExecutor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OdfMigrationExecutor.java index bb0932883..5e9c70ae5 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OdfMigrationExecutor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OdfMigrationExecutor.java @@ -1,11 +1,14 @@ package eu.dnetlib.dhp.migration; +import java.util.ArrayList; import java.util.List; import java.util.Map; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.dom4j.Document; +import org.dom4j.Node; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.DataInfo; @@ -29,134 +32,160 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor { @Override protected void registerNamespaces(final Map nsContext) { - // TODO Auto-generated method stub - - } - - @Override - protected List prepareInstances(final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) { - // TODO Auto-generated method stub - return null; - } - - @Override - protected List> prepareSources(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; - } - - @Override - protected Field prepareEmbargoEndDate(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; - } - - @Override - protected List prepareRelevantDates(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; - } - - @Override - protected List> prepareCoverages(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; - } - - @Override - protected List> prepareContributors(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; - } - - @Override - protected 
List> prepareFormats(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; - } - - @Override - protected Field preparePublisher(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; - } - - @Override - protected List> prepareDescriptions(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + nsContext.put("dc", "http://datacite.org/schema/kernel-3"); } @Override protected List prepareTitles(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; - } - - @Override - protected List prepareSubjects(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; - } - - @Override - protected Qualifier prepareLanguages(final Document doc) { - // TODO Auto-generated method stub - return null; + return prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER, info); } @Override protected List prepareAuthors(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + final List res = new ArrayList<>(); + int pos = 1; + for (final Object o : doc.selectNodes("//dc:creator")) { + final Node n = (Node) o; + final Author author = new Author(); + author.setFullname(n.valueOf("./dc:creatorName")); + author.setName(n.valueOf("./dc:givenName")); + author.setSurname(n.valueOf("./dc:familyName")); + author.setAffiliation(prepareListFields(doc, "./dc:affiliation", info)); + author.setPid(preparePids(doc, info)); + author.setRank(pos++); + res.add(author); + } + return res; + } + + private List preparePids(final Document doc, final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : doc.selectNodes("./dc:nameIdentifier")) { + res.add(structuredProperty(((Node) o).getText(), prepareQualifier((Node) o, "./@nameIdentifierScheme", "dnet:pid_types", "dnet:pid_types"), info)); + } + return res; + } + + @Override + protected List 
prepareInstances(final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) { + final List res = new ArrayList<>(); + for (final Object o : doc.selectNodes("//dc:alternateIdentifier[@alternateIdentifierType='URL']")) { + final Instance instance = new Instance(); + instance.setUrl(((Node) o).getText().trim()); + instance.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource")); + instance.setCollectedfrom(collectedfrom); + instance.setHostedby(hostedby); + instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); + instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); + instance.setAccessright(prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes")); + instance.setLicense(field(doc.valueOf("//oaf:license"), info)); + res.add(instance); + } + return res; + } + + @Override + protected List> prepareSources(final Document doc, final DataInfo info) { + return new ArrayList<>(); // Not present in ODF ??? + } + + @Override + protected List prepareRelevantDates(final Document doc, final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : doc.selectNodes("//dc:date")) { + final String dateType = ((Node) o).valueOf("@dateType"); + if (StringUtils.isBlank(dateType) && !dateType.equalsIgnoreCase("Accepted") && !dateType.equalsIgnoreCase("Issued") + && !dateType.equalsIgnoreCase("Updated") && !dateType.equalsIgnoreCase("Available")) { + res.add(structuredProperty(((Node) o).getText(), "UNKNOWN", "UNKNOWN", "dnet:dataCite_date", "dnet:dataCite_date", info)); + } + } + return res; + } + + @Override + protected List> prepareCoverages(final Document doc, final DataInfo info) { + return new ArrayList<>(); // Not present in ODF ??? 
+ } + + @Override + protected List> prepareContributors(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:contributorName", info); + } + + @Override + protected List> prepareFormats(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:format", info); + } + + @Override + protected Field preparePublisher(final Document doc, final DataInfo info) { + return prepareField(doc, "//dc:publisher", info); + } + + @Override + protected List> prepareDescriptions(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:description[@descriptionType='Abstract']", info); + } + + @Override + protected List prepareSubjects(final Document doc, final DataInfo info) { + return prepareListStructProps(doc, "//dc:subject", info); + } + + @Override + protected Qualifier prepareLanguages(final Document doc) { + return prepareQualifier(doc, "//dc:language", "dnet:languages", "dnet:languages"); } @Override protected List> prepareOtherResearchProductTools(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return new ArrayList<>(); // Not present in ODF ??? 
} @Override protected List> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return prepareListFields(doc, "//dc:contributor[@contributorType='ContactGroup']/dc:contributorName", info); } @Override protected List> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return prepareListFields(doc, "//dc:contributor[@contributorType='ContactPerson']/dc:contributorName", info); } @Override protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return prepareQualifier(doc, "//dc:format", "dnet:programming_languages", "dnet:programming_languages"); } @Override protected Field prepareSoftwareCodeRepositoryUrl(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return null; // Not present in ODF ??? } @Override protected List prepareSoftwareLicenses(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return new ArrayList<>(); // Not present in ODF ??? 
} @Override protected List> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return prepareListFields(doc, "//dc:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info); } + // DATASETS + @Override protected List prepareDatasetGeoLocations(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + final List res = new ArrayList<>(); + + for (final Object o : doc.selectNodes("//dc:geoLocation")) { + final GeoLocation loc = new GeoLocation(); + loc.setBox(((Node) o).valueOf("./dc:geoLocationBox")); + loc.setPlace(((Node) o).valueOf("./dc:geoLocationPlace")); + loc.setPoint(((Node) o).valueOf("./dc:geoLocationPoint")); + res.add(loc); + } + return res; } @Override @@ -167,32 +196,27 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor { @Override protected Field prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return prepareField(doc, "//dc:date[@dateType='Updated']", info); } @Override protected Field prepareDatasetVersion(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return prepareField(doc, "//dc:version", info); } @Override protected Field prepareDatasetSize(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return prepareField(doc, "//dc:size", info); } @Override protected Field prepareDatasetDevice(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return null; // Not present in ODF ??? 
} @Override protected Field prepareDatasetStorageDate(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return prepareField(doc, "//dc:date[@dateType='Issued']", info); } @Override From 7f11d06a1ff44da94fdb74f67f7fc225b1158801 Mon Sep 17 00:00:00 2001 From: "sandro.labruzzo" Date: Mon, 10 Feb 2020 12:58:59 +0100 Subject: [PATCH 27/45] upgraded version of dnet-pace-core in pom.xml --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index aedf5ebff..a1db6894c 100644 --- a/pom.xml +++ b/pom.xml @@ -200,7 +200,7 @@ eu.dnetlib dnet-pace-core - 4.0.0-SNAPSHOT + 4.0.0 From 95740767e07cb6360797f48c73a0dc124ec2d903 Mon Sep 17 00:00:00 2001 From: Michele Artini Date: Mon, 10 Feb 2020 16:04:06 +0100 Subject: [PATCH 28/45] Ready for tests --- .../dhp/migration/AbstractMongoExecutor.java | 72 +++++-- .../dhp/migration/OafMigrationExecutor.java | 116 ++++++------ .../dhp/migration/OdfMigrationExecutor.java | 54 +++++- .../dhp/migration/pace/PacePerson.java | 176 ++++++++++++++++++ .../dhp/migration/pace/name_particles.txt | 7 + 5 files changed, 346 insertions(+), 79 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/pace/PacePerson.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/pace/name_particles.txt diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java index 1fa70dded..b2792e292 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java @@ -30,6 +30,7 @@ import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; import 
eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; @@ -164,14 +165,56 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { } if (!oafs.isEmpty()) { - addRelations(oafs, doc, "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO - addRelations(oafs, doc, "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO - addRelations(oafs, doc, "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO + oafs.addAll(addProjectRels(doc, collectedFrom, info, lastUpdateTimestamp)); + oafs.addAll(addOtherResultRels(doc, collectedFrom, info, lastUpdateTimestamp)); } return oafs; } + private List addProjectRels(final Document doc, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp) { + + final List res = new ArrayList<>(); + + final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier")); + + for (final Object o : doc.selectNodes("//oaf:projectid")) { + final String projectId = createOpenaireId(40, ((Node) o).getText()); + + final Relation r1 = new Relation(); + r1.setRelType("resultProject"); + r1.setSubRelType("outcome"); + r1.setRelClass("isProducedBy"); + r1.setSource(docId); + r1.setTarget(projectId); + r1.setCollectedFrom(Arrays.asList(collectedFrom)); + r1.setDataInfo(info); + r1.setLastupdatetimestamp(lastUpdateTimestamp); + res.add(r1); + + final Relation r2 = new Relation(); + r2.setRelType("resultProject"); + r2.setSubRelType("outcome"); + r2.setRelClass("produces"); + r2.setSource(projectId); + r2.setTarget(docId); + r2.setCollectedFrom(Arrays.asList(collectedFrom)); + r2.setDataInfo(info); + r2.setLastupdatetimestamp(lastUpdateTimestamp); + res.add(r2); + } + + return res; + } + + protected abstract List addOtherResultRels(final Document doc, + final KeyValue 
collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp); + private void populateResultFields(final Result r, final Document doc, final KeyValue collectedFrom, @@ -199,19 +242,21 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { r.setPublisher(preparePublisher(doc, info)); r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info)); r.setSource(prepareSources(doc, info)); - r.setFulltext(null); // NOT PRESENT IN MDSTORES + r.setFulltext(new ArrayList<>()); // NOT PRESENT IN MDSTORES r.setFormat(prepareFormats(doc, info)); r.setContributor(prepareContributors(doc, info)); - r.setResourcetype(null); // TODO + r.setResourcetype(prepareResourceType(doc, info)); r.setCoverage(prepareCoverages(doc, info)); - r.setRefereed(null); // TODO - r.setContext(null); // TODO - r.setExternalReference(null); // TODO + r.setRefereed(null); // NOT PRESENT IN MDSTORES + r.setContext(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES r.setInstance(prepareInstances(doc, info, collectedFrom, hostedBy)); - r.setProcessingchargeamount(null); // TODO - r.setProcessingchargecurrency(null); // TODO + r.setProcessingchargeamount(null); // NOT PRESENT IN MDSTORES + r.setProcessingchargecurrency(null); // NOT PRESENT IN MDSTORES } + protected abstract Qualifier prepareResourceType(Document doc, DataInfo info); + protected abstract List prepareInstances(Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby); protected abstract List> prepareSources(Document doc, DataInfo info); @@ -264,13 +309,6 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { protected abstract Field prepareDatasetStorageDate(Document doc, DataInfo info); - abstract protected void addRelations(final List oafs, - final Document doc, - final String type, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp); - private 
Journal prepareJournal(final Document doc, final DataInfo info) { final Node n = doc.selectSingleNode("//oaf:journal"); if (n != null) { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OafMigrationExecutor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OafMigrationExecutor.java index 6dcfae71f..75360943c 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OafMigrationExecutor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OafMigrationExecutor.java @@ -10,6 +10,7 @@ import org.apache.commons.logging.LogFactory; import org.dom4j.Document; import org.dom4j.Node; +import eu.dnetlib.dhp.migration.pace.PacePerson; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Field; @@ -37,29 +38,6 @@ public class OafMigrationExecutor extends AbstractMongoExecutor { nsContext.put("dc", "http://purl.org/dc/elements/1.1/"); } - @Override - protected void addRelations(final List oafs, - final Document doc, - final String type, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp) { - for (final Object o : doc.selectNodes("//")) { // TODO - final Node n = (Node) o; - final Relation r = new Relation(); - r.setRelType(null); // TODO - r.setSubRelType(null); // TODO - r.setRelClass(null); // TODO - r.setSource(null); // TODO - r.setTarget(null); // TODO - r.setCollectedFrom(Arrays.asList(collectedFrom)); - r.setDataInfo(info); - r.setLastupdatetimestamp(lastUpdateTimestamp); - oafs.add(r); - } - - } - @Override protected List prepareAuthors(final Document doc, final DataInfo info) { final List res = new ArrayList<>(); @@ -69,6 +47,11 @@ public class OafMigrationExecutor extends AbstractMongoExecutor { final Author author = new Author(); author.setFullname(n.getText()); author.setRank(pos++); + final PacePerson p = new PacePerson(n.getText(), false); + if 
(p.isAccurate()) { + author.setName(p.getNormalisedFirstName()); + author.setSurname(p.getNormalisedSurname()); + } res.add(author); } return res; @@ -142,97 +125,124 @@ public class OafMigrationExecutor extends AbstractMongoExecutor { @Override protected List prepareRelevantDates(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return new ArrayList<>(); // NOT PRESENT IN OAF } // SOFTWARES @Override protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return null; // NOT PRESENT IN OAF } @Override protected Field prepareSoftwareCodeRepositoryUrl(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return null; // NOT PRESENT IN OAF } @Override protected List prepareSoftwareLicenses(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return new ArrayList<>(); // NOT PRESENT IN OAF } @Override protected List> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return new ArrayList<>(); // NOT PRESENT IN OAF } // DATASETS @Override protected List prepareDatasetGeoLocations(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return new ArrayList<>(); // NOT PRESENT IN OAF } @Override protected Field prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return null; // NOT PRESENT IN OAF } @Override protected Field prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return null; // NOT PRESENT IN OAF } @Override protected Field prepareDatasetVersion(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return null; // 
NOT PRESENT IN OAF } @Override protected Field prepareDatasetSize(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return null; // NOT PRESENT IN OAF } @Override protected Field prepareDatasetDevice(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return null; // NOT PRESENT IN OAF } @Override protected Field prepareDatasetStorageDate(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return null; // NOT PRESENT IN OAF } // OTHER PRODUCTS @Override protected List> prepareOtherResearchProductTools(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return new ArrayList<>(); // NOT PRESENT IN OAF } @Override protected List> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return new ArrayList<>(); // NOT PRESENT IN OAF } @Override protected List> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return new ArrayList<>(); // NOT PRESENT IN OAF + } + + @Override + protected List addOtherResultRels(final Document doc, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp) { + final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier")); + + final List res = new ArrayList<>(); + + for (final Object o : doc.selectNodes("//*[local-name()='relatedDataset']")) { + final String otherId = createOpenaireId(50, ((Node) o).getText()); + + final Relation r1 = new Relation(); + r1.setRelType("resultResult"); + r1.setSubRelType("publicationDataset"); + r1.setRelClass("isRelatedTo"); + r1.setSource(docId); + r1.setTarget(otherId); + r1.setCollectedFrom(Arrays.asList(collectedFrom)); + r1.setDataInfo(info); + r1.setLastupdatetimestamp(lastUpdateTimestamp); + res.add(r1); + + 
final Relation r2 = new Relation(); + r2.setRelType("resultResult"); + r2.setSubRelType("publicationDataset"); + r2.setRelClass("isRelatedTo"); + r2.setSource(otherId); + r2.setTarget(docId); + r2.setCollectedFrom(Arrays.asList(collectedFrom)); + r2.setDataInfo(info); + r2.setLastupdatetimestamp(lastUpdateTimestamp); + res.add(r2); + } + return res; + } + + @Override + protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OdfMigrationExecutor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OdfMigrationExecutor.java index 5e9c70ae5..b1dbfcdf4 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OdfMigrationExecutor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OdfMigrationExecutor.java @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.migration; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.Map; @@ -18,6 +19,7 @@ import eu.dnetlib.dhp.schema.oaf.Instance; import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; public class OdfMigrationExecutor extends AbstractMongoExecutor { @@ -190,8 +192,7 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor { @Override protected Field prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return null; // Not present in ODF ??? 
} @Override @@ -220,14 +221,49 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor { } @Override - protected void addRelations(final List oafs, - final Document doc, - final String type, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp) { - // TODO Auto-generated method stub + protected List addOtherResultRels(final Document doc, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) { + final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier")); + + final List res = new ArrayList<>(); + + for (final Object o : doc.selectNodes("//*[local-name() = 'resource']//*[local-name()='relatedIdentifier' and ./@relatedIdentifierType='OPENAIRE']")) { + final String otherId = createOpenaireId(50, ((Node) o).getText()); + final String type = ((Node) o).valueOf("@relationType"); + + if (type.equals("IsSupplementTo")) { + res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, docId, otherId, "supplement", "isSupplementTo")); + res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, otherId, docId, "supplement", "isSupplementedBy")); + } else if (type.equals("IsPartOf")) { + res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, docId, otherId, "part", "IsPartOf")); + res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, otherId, docId, "part", "HasParts")); + } else {} + } + return res; + } + + private Relation prepareOtherResultRel(final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp, + final String source, + final String target, + final String subRelType, + final String relClass) { + final Relation r = new Relation(); + r.setRelType("resultResult"); + r.setSubRelType(subRelType); + r.setRelClass(relClass); + r.setSource(source); + r.setTarget(target); + r.setCollectedFrom(Arrays.asList(collectedFrom)); + r.setDataInfo(info); + r.setLastupdatetimestamp(lastUpdateTimestamp); + 
return r; + } + + @Override + protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { + return prepareQualifier(doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", "dnet:dataCite_resource", "dnet:dataCite_resource"); } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/pace/PacePerson.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/pace/PacePerson.java new file mode 100644 index 000000000..927f5641b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/pace/PacePerson.java @@ -0,0 +1,176 @@ +package eu.dnetlib.dhp.migration.pace; + +import java.nio.charset.Charset; +import java.text.Normalizer; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.text.WordUtils; + +import com.google.common.base.Joiner; +import com.google.common.base.Splitter; +import com.google.common.collect.Iterables; +import com.google.common.collect.Lists; +import com.google.common.hash.Hashing; + +public class PacePerson { + + private static final String UTF8 = "UTF-8"; + private List name = Lists.newArrayList(); + private List surname = Lists.newArrayList(); + private List fullname = Lists.newArrayList(); + private final String original; + + private static Set particles = null; + + public static final String capitalize(final String s) { + return WordUtils.capitalize(s.toLowerCase(), ' ', '-'); + } + + public static final String dotAbbreviations(final String s) { + return s.length() == 1 ? s + "." 
: s; + } + + public static Set loadFromClasspath(final String classpath) { + final Set h = new HashSet<>(); + try { + for (final String s : IOUtils.readLines(PacePerson.class.getResourceAsStream(classpath))) { + h.add(s); + } + } catch (final Throwable e) { + return new HashSet<>(); + } + return h; + } + + public PacePerson(String s, final boolean aggressive) { + original = s; + s = Normalizer.normalize(s, Normalizer.Form.NFD); + s = s.replaceAll("\\(.+\\)", ""); + s = s.replaceAll("\\[.+\\]", ""); + s = s.replaceAll("\\{.+\\}", ""); + s = s.replaceAll("\\s+-\\s+", "-"); + s = s.replaceAll("[\\p{Punct}&&[^,-]]", " "); + s = s.replaceAll("\\d", " "); + s = s.replaceAll("\\n", " "); + s = s.replaceAll("\\.", " "); + s = s.replaceAll("\\s+", " "); + + if (aggressive) { + s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", ""); + // s = s.replaceAll("[\\W&&[^,-]]", ""); + } + + if (s.contains(",")) { + final String[] arr = s.split(","); + if (arr.length == 1) { + fullname = splitTerms(arr[0]); + } else if (arr.length > 1) { + surname = splitTerms(arr[0]); + name = splitTerms(arr[1]); + fullname.addAll(surname); + fullname.addAll(name); + } + } else { + fullname = splitTerms(s); + + int lastInitialPosition = fullname.size(); + boolean hasSurnameInUpperCase = false; + + for (int i = 0; i < fullname.size(); i++) { + final String term = fullname.get(i); + if (term.length() == 1) { + lastInitialPosition = i; + } else if (term.equals(term.toUpperCase())) { + hasSurnameInUpperCase = true; + } + } + + if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. 
Artini + name = fullname.subList(0, lastInitialPosition + 1); + surname = fullname.subList(lastInitialPosition + 1, fullname.size()); + } else if (hasSurnameInUpperCase) { // Case: Michele ARTINI + for (final String term : fullname) { + if (term.length() > 1 && term.equals(term.toUpperCase())) { + surname.add(term); + } else { + name.add(term); + } + } + } + } + } + + private List splitTerms(final String s) { + if (particles == null) { + particles = loadFromClasspath("/eu/dnetlib/dhp/migration/pace/name_particles.txt"); + } + + final List list = Lists.newArrayList(); + for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) { + if (!particles.contains(part.toLowerCase())) { + list.add(part); + } + } + return list; + } + + public List getName() { + return name; + } + + public String getNameString() { + return Joiner.on(" ").join(getName()); + } + + public List getSurname() { + return surname; + } + + public List getFullname() { + return fullname; + } + + public String getOriginal() { + return original; + } + + public String hash() { + return Hashing.murmur3_128().hashString(getNormalisedFullname(), Charset.forName(UTF8)).toString(); + } + + public String getNormalisedFirstName() { + return Joiner.on(" ").join(getCapitalFirstnames()); + } + + public String getNormalisedSurname() { + return Joiner.on(" ").join(getCapitalSurname()); + } + + public String getSurnameString() { + return Joiner.on(" ").join(getSurname()); + } + + public String getNormalisedFullname() { + return isAccurate() ? 
getNormalisedSurname() + ", " + getNormalisedFirstName() : Joiner.on(" ").join(fullname); + } + + public List getCapitalFirstnames() { + return Lists.newArrayList(Iterables.transform(getNameWithAbbreviations(), PacePerson::capitalize)); + } + + public List getCapitalSurname() { + return Lists.newArrayList(Iterables.transform(surname, PacePerson::capitalize)); + } + + public List getNameWithAbbreviations() { + return Lists.newArrayList(Iterables.transform(name, PacePerson::dotAbbreviations)); + } + + public boolean isAccurate() { + return name != null && surname != null && !name.isEmpty() && !surname.isEmpty(); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/pace/name_particles.txt b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/pace/name_particles.txt new file mode 100644 index 000000000..dae37c9dc --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/pace/name_particles.txt @@ -0,0 +1,7 @@ +van +der +de +dell +sig +mr +mrs From 5fc09b179cf68d169070b43f47ec8828e3351e49 Mon Sep 17 00:00:00 2001 From: Michele Artini Date: Tue, 11 Feb 2020 12:48:03 +0100 Subject: [PATCH 29/45] bug fixing --- dhp-workflows/dhp-aggregation/pom.xml | 6 ++++++ .../dnetlib/dhp/migration/AbstractMigrationExecutor.java | 7 +++++++ .../src/main/java/eu/dnetlib/dhp/migration/DbClient.java | 7 +++++-- .../dhp/migration/migrate_db_entities_parameters.json | 4 ++-- 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/pom.xml b/dhp-workflows/dhp-aggregation/pom.xml index d031c0308..d523945ea 100644 --- a/dhp-workflows/dhp-aggregation/pom.xml +++ b/dhp-workflows/dhp-aggregation/pom.xml @@ -55,6 +55,12 @@ org.mongodb mongo-java-driver + + + org.postgresql + postgresql + 42.2.10 + org.mockito diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrationExecutor.java 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrationExecutor.java index 389790511..bf877dcf3 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrationExecutor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrationExecutor.java @@ -10,6 +10,8 @@ import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -42,7 +44,12 @@ public class AbstractMigrationExecutor implements Closeable { private final SequenceFile.Writer writer; + private static final Log log = LogFactory.getLog(AbstractMigrationExecutor.class); + public AbstractMigrationExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser) throws Exception { + + log.info(String.format("Creating SequenceFile Writer, hdfsPath=%s, nameNode=%s, user=%s", hdfsPath, hdfsNameNode, hdfsUser)); + this.writer = SequenceFile.createWriter(getConf(hdfsNameNode, hdfsUser), SequenceFile.Writer.file(new Path(hdfsPath)), SequenceFile.Writer .keyClass(IntWritable.class), SequenceFile.Writer.valueClass(Text.class)); } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/DbClient.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/DbClient.java index e9fee63b9..246dae474 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/DbClient.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/DbClient.java @@ -9,6 +9,7 @@ import java.sql.SQLException; import java.sql.Statement; import java.util.function.Consumer; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import 
org.apache.commons.logging.LogFactory; @@ -22,7 +23,9 @@ public class DbClient implements Closeable { try { Class.forName("org.postgresql.Driver"); - this.connection = DriverManager.getConnection(address, login, password); + + this.connection = + StringUtils.isNoneBlank(login, password) ? DriverManager.getConnection(address, login, password) : DriverManager.getConnection(address); this.connection.setAutoCommit(false); } catch (final Exception e) { log.error(e.getClass().getName() + ": " + e.getMessage()); @@ -34,7 +37,7 @@ public class DbClient implements Closeable { public void processResults(final String sql, final Consumer consumer) { try (final Statement stmt = connection.createStatement()) { - try (final ResultSet rs = stmt.executeQuery("SELECT * FROM COMPANY;")) { + try (final ResultSet rs = stmt.executeQuery(sql)) { while (rs.next()) { consumer.accept(rs); } diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json index 861d297ba..5e9f378f5 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json @@ -27,12 +27,12 @@ "paramName": "dbuser", "paramLongName": "postgresUser", "paramDescription": "postgres user", - "paramRequired": true + "paramRequired": false }, { "paramName": "dbpasswd", "paramLongName": "postgresPassword", "paramDescription": "postgres password", - "paramRequired": true + "paramRequired": false } ] \ No newline at end of file From 06c2fd6df90d1113f06b641f296ecce63740c104 Mon Sep 17 00:00:00 2001 From: Michele Artini Date: Tue, 11 Feb 2020 15:29:50 +0100 Subject: [PATCH 30/45] bug fixing --- .../migration/AbstractMigrationExecutor.java | 6 +++- .../eu/dnetlib/dhp/migration/DbClient.java | 2 
++ .../MigrateDbEntitiesApplication.java | 36 +++++++++++++------ .../dhp/migration/sql/queryProjects.sql | 9 +++-- .../src/main/resources/log4j.properties | 9 +++++ 5 files changed, 47 insertions(+), 15 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/log4j.properties diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrationExecutor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrationExecutor.java index bf877dcf3..3367399c6 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrationExecutor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrationExecutor.java @@ -71,12 +71,13 @@ public class AbstractMigrationExecutor implements Closeable { value.set(objectMapper.writeValueAsString(oaf)); writer.append(key, value); } catch (final Exception e) { - e.printStackTrace(); + throw new RuntimeException(e); } } @Override public void close() throws IOException { + writer.hflush(); writer.close(); } @@ -216,4 +217,7 @@ public class AbstractMigrationExecutor implements Closeable { } + public static String asString(final Object o) { + return o == null ? 
"" : o.toString(); + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/DbClient.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/DbClient.java index 246dae474..9ac0089d2 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/DbClient.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/DbClient.java @@ -37,6 +37,8 @@ public class DbClient implements Closeable { public void processResults(final String sql, final Consumer consumer) { try (final Statement stmt = connection.createStatement()) { + stmt.setFetchSize(100); + try (final ResultSet rs = stmt.executeQuery(sql)) { while (rs.next()) { consumer.accept(rs); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java index 12043709f..d22e8e5b3 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java @@ -54,11 +54,22 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl final String hdfsUser = parser.get("hdfsUser"); try (final MigrateDbEntitiesApplication smdbe = new MigrateDbEntitiesApplication(hdfsPath, hdfsNameNode, hdfsUser, dbUrl, dbUser, dbPassword)) { + log.info("Processing datasources..."); smdbe.execute("queryDatasources.sql", smdbe::processDatasource); + + log.info("Processing projects..."); smdbe.execute("queryProjects.sql", smdbe::processProject); + + log.info("Processing orgs..."); smdbe.execute("queryOrganizations.sql", smdbe::processOrganization); + + log.info("Processing relations ds <-> orgs ..."); smdbe.execute("queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization); + + log.info("Processing projects <-> orgs 
..."); smdbe.execute("queryProjectOrganization.sql", smdbe::processProjectOrganization); + + log.info("All done."); } } @@ -75,6 +86,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl } public void processDatasource(final ResultSet rs) { + try { final DataInfo info = prepareDataInfo(rs); @@ -85,7 +97,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl ds.setOriginalId(Arrays.asList(rs.getString("datasourceid"))); ds.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); ds.setPid(new ArrayList<>()); - ds.setDateofcollection(rs.getDate("dateofcollection").toString()); + ds.setDateofcollection(asString(rs.getDate("dateofcollection"))); ds.setDateoftransformation(null); // Value not returned by the SQL query ds.setExtraInfo(new ArrayList<>()); // Values not present in the DB ds.setOaiprovenance(null); // Values not present in the DB @@ -99,17 +111,17 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl ds.setNamespaceprefix(field(rs.getString("namespaceprefix"), info)); ds.setLatitude(field(Double.toString(rs.getDouble("latitude")), info)); ds.setLongitude(field(Double.toString(rs.getDouble("longitude")), info)); - ds.setDateofvalidation(field(rs.getDate("dateofvalidation").toString(), info)); + ds.setDateofvalidation(field(asString(rs.getDate("dateofvalidation")), info)); ds.setDescription(field(rs.getString("description"), info)); ds.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info)); ds.setOdnumberofitems(field(Double.toString(rs.getInt("odnumberofitems")), info)); - ds.setOdnumberofitemsdate(field(rs.getDate("odnumberofitemsdate").toString(), info)); + ds.setOdnumberofitemsdate(field(asString(rs.getDate("odnumberofitemsdate")), info)); ds.setOdpolicies(field(rs.getString("odpolicies"), info)); ds.setOdlanguages(prepareListFields(rs.getArray("odlanguages"), info)); 
ds.setOdcontenttypes(prepareListFields(rs.getArray("odcontenttypes"), info)); ds.setAccessinfopackage(prepareListFields(rs.getArray("accessinfopackage"), info)); - ds.setReleasestartdate(field(rs.getDate("releasestartdate").toString(), info)); - ds.setReleaseenddate(field(rs.getDate("releaseenddate").toString(), info)); + ds.setReleasestartdate(field(asString(rs.getDate("releasestartdate")), info)); + ds.setReleaseenddate(field(asString(rs.getDate("releaseenddate")), info)); ds.setMissionstatementurl(field(rs.getString("missionstatementurl"), info)); ds.setDataprovider(field(rs.getBoolean("dataprovider"), info)); ds.setServiceprovider(field(rs.getBoolean("serviceprovider"), info)); @@ -192,16 +204,16 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl p.setOriginalId(Arrays.asList(rs.getString("projectid"))); p.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); p.setPid(new ArrayList<>()); - p.setDateofcollection(rs.getDate("dateofcollection").toString()); - p.setDateoftransformation(rs.getDate("dateoftransformation").toString()); + p.setDateofcollection(asString(rs.getDate("dateofcollection"))); + p.setDateoftransformation(asString(rs.getDate("dateoftransformation"))); p.setExtraInfo(new ArrayList<>()); // Values not present in the DB p.setOaiprovenance(null); // Values not present in the DB p.setWebsiteurl(field(rs.getString("websiteurl"), info)); p.setCode(field(rs.getString("code"), info)); p.setAcronym(field(rs.getString("acronym"), info)); p.setTitle(field(rs.getString("title"), info)); - p.setStartdate(field(rs.getDate("startdate").toString(), info)); - p.setEnddate(field(rs.getDate("enddate").toString(), info)); + p.setStartdate(field(asString(rs.getDate("startdate")), info)); + p.setEnddate(field(asString(rs.getDate("enddate")), info)); p.setCallidentifier(field(rs.getString("callidentifier"), info)); p.setKeywords(field(rs.getString("keywords"), info)); 
p.setDuration(field(Integer.toString(rs.getInt("duration")), info)); @@ -271,6 +283,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl } public void processOrganization(final ResultSet rs) { + try { final DataInfo info = prepareDataInfo(rs); @@ -281,8 +294,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl o.setOriginalId(Arrays.asList(rs.getString("organizationid"))); o.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); o.setPid(new ArrayList<>()); - o.setDateofcollection(rs.getDate("dateofcollection").toString()); - o.setDateoftransformation(rs.getDate("dateoftransformation").toString()); + o.setDateofcollection(asString(rs.getDate("dateofcollection"))); + o.setDateoftransformation(asString(rs.getDate("dateoftransformation"))); o.setExtraInfo(new ArrayList<>()); // Values not present in the DB o.setOaiprovenance(null); // Values not present in the DB o.setLegalshortname(field("legalshortname", info)); @@ -387,6 +400,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl } public void processProjectOrganization(final ResultSet rs) { + try { final DataInfo info = prepareDataInfo(rs); final String orgId = createOpenaireId(20, rs.getString("resporganization")); diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects.sql index f04f1f03b..6cff18875 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects.sql +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects.sql @@ -31,7 +31,7 @@ SELECT p.fundedamount AS fundedamount, dc.id AS collectedfromid, dc.officialname AS collectedfromname, - p.contracttype || '@@@' || p.contracttypename || '@@@' || p.contracttypescheme || '@@@' || 
p.contracttypescheme AS contracttype, + ctc.code || '@@@' || ctc.name || '@@@' || cts.code || '@@@' || cts.name AS contracttype, pac.code || '@@@' || pac.name || '@@@' || pas.code || '@@@' || pas.name AS provenanceaction, array_agg(DISTINCT i.pid || '###' || i.issuertype) AS pid, array_agg(DISTINCT s.name || '###' || sc.code || '@@@' || sc.name || '@@@' || ss.code || '@@@' || ss.name) AS subjects, @@ -54,6 +54,9 @@ SELECT LEFT OUTER JOIN class sc ON (sc.code = s.semanticclass) LEFT OUTER JOIN scheme ss ON (ss.code = s.semanticscheme) + LEFT OUTER JOIN class ctc ON (ctc.code = p.contracttypeclass) + LEFT OUTER JOIN scheme cts ON (cts.code = p.contracttypescheme) + GROUP BY p.id, p.code, @@ -77,11 +80,11 @@ SELECT p.contactfax, p.contactphone, p.contactemail, - p.contracttype, p.summary, p.currency, p.totalcost, p.fundedamount, dc.id, dc.officialname, - pac.code, pac.name, pas.code, pas.name; \ No newline at end of file + pac.code, pac.name, pas.code, pas.name, + ctc.code, ctc.name, cts.code, cts.name; \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/log4j.properties b/dhp-workflows/dhp-aggregation/src/main/resources/log4j.properties new file mode 100644 index 000000000..63cba917e --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/log4j.properties @@ -0,0 +1,9 @@ +# Set root logger level to DEBUG and its only appender to A1. +log4j.rootLogger=INFO, A1 + +# A1 is set to be a ConsoleAppender. +log4j.appender.A1=org.apache.log4j.ConsoleAppender + +# A1 uses PatternLayout. 
+log4j.appender.A1.layout=org.apache.log4j.PatternLayout +log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n From 69336195d3aaefe95a557f592893de4d8f3b79b1 Mon Sep 17 00:00:00 2001 From: Michele Artini Date: Wed, 12 Feb 2020 11:12:38 +0100 Subject: [PATCH 31/45] simplifications --- .../migration/AbstractMigrationExecutor.java | 47 +++++++++++-------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrationExecutor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrationExecutor.java index 3367399c6..11c1fb6ae 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrationExecutor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrationExecutor.java @@ -6,6 +6,7 @@ import java.net.URI; import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.Objects; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; @@ -15,7 +16,6 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.codehaus.jackson.map.ObjectMapper; @@ -36,7 +36,7 @@ public class AbstractMigrationExecutor implements Closeable { private final AtomicInteger counter = new AtomicInteger(0); - private final IntWritable key = new IntWritable(counter.get()); + private final Text key = new Text(); private final Text value = new Text(); @@ -51,7 +51,7 @@ public class AbstractMigrationExecutor implements Closeable { log.info(String.format("Creating SequenceFile Writer, hdfsPath=%s, nameNode=%s, user=%s", hdfsPath, hdfsNameNode, hdfsUser)); this.writer = 
SequenceFile.createWriter(getConf(hdfsNameNode, hdfsUser), SequenceFile.Writer.file(new Path(hdfsPath)), SequenceFile.Writer - .keyClass(IntWritable.class), SequenceFile.Writer.valueClass(Text.class)); + .keyClass(Text.class), SequenceFile.Writer.valueClass(Text.class)); } private Configuration getConf(final String hdfsNameNode, final String hdfsUser) throws IOException { @@ -67,7 +67,7 @@ public class AbstractMigrationExecutor implements Closeable { protected void emitOaf(final Oaf oaf) { try { - key.set(counter.getAndIncrement()); + key.set(counter.getAndIncrement() + ":" + oaf.getClass().getSimpleName().toLowerCase()); value.set(objectMapper.writeValueAsString(oaf)); writer.append(key, value); } catch (final Exception e) { @@ -99,6 +99,8 @@ public class AbstractMigrationExecutor implements Closeable { } public static Field field(final T value, final DataInfo info) { + if (value == null || StringUtils.isBlank(value.toString())) { return null; } + final Field field = new Field<>(); field.setValue(value); field.setDataInfo(info); @@ -106,7 +108,7 @@ public class AbstractMigrationExecutor implements Closeable { } public static List> listFields(final DataInfo info, final String... 
values) { - return Arrays.stream(values).map(v -> field(v, info)).collect(Collectors.toList()); + return Arrays.stream(values).map(v -> field(v, info)).filter(Objects::nonNull).collect(Collectors.toList()); } public static Qualifier qualifier(final String classid, final String classname, final String schemeid, final String schemename) { @@ -124,10 +126,12 @@ public class AbstractMigrationExecutor implements Closeable { final String schemeid, final String schemename, final DataInfo dataInfo) { + return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo); } public static StructuredProperty structuredProperty(final String value, final Qualifier qualifier, final DataInfo dataInfo) { + if (value == null) { return null; } final StructuredProperty sp = new StructuredProperty(); sp.setValue(value); sp.setQualifier(qualifier); @@ -178,20 +182,25 @@ public class AbstractMigrationExecutor implements Closeable { final String conferenceplace, final String conferencedate, final DataInfo dataInfo) { - final Journal j = new Journal(); - j.setName(name); - j.setIssnPrinted(issnPrinted); - j.setIssnOnline(issnOnline); - j.setIssnLinking(issnLinking); - j.setEp(ep); - j.setIss(iss); - j.setSp(sp); - j.setVol(vol); - j.setEdition(edition); - j.setConferenceplace(conferenceplace); - j.setConferencedate(conferencedate); - j.setDataInfo(dataInfo); - return j; + + if (StringUtils.isNotBlank(name) || StringUtils.isNotBlank(issnPrinted) || StringUtils.isNotBlank(issnOnline) || StringUtils.isNotBlank(issnLinking)) { + final Journal j = new Journal(); + j.setName(name); + j.setIssnPrinted(issnPrinted); + j.setIssnOnline(issnOnline); + j.setIssnLinking(issnLinking); + j.setEp(ep); + j.setIss(iss); + j.setSp(sp); + j.setVol(vol); + j.setEdition(edition); + j.setConferenceplace(conferenceplace); + j.setConferencedate(conferencedate); + j.setDataInfo(dataInfo); + return j; + } else { + return null; + } } public static DataInfo dataInfo(final Boolean 
deletedbyinference, From cdea0dae75abff4ea83c4ab4d40e01d6ab93c749 Mon Sep 17 00:00:00 2001 From: Michele Artini Date: Wed, 12 Feb 2020 16:34:00 +0100 Subject: [PATCH 32/45] bug fixing --- .../dhp/migration/AbstractMongoExecutor.java | 19 +++++++++++++++++-- .../migrate_mongo_mstores_parameters.json | 10 +++++----- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java index b2792e292..83e05c59f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java @@ -11,6 +11,8 @@ import java.util.Map; import java.util.Map.Entry; import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.DocumentFactory; @@ -49,6 +51,8 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { protected static final Qualifier SOFTWARE_RESULTTYPE_QUALIFIER = qualifier("software", "software", "dnet:result_typologies", "dnet:result_typologies"); protected static final Qualifier OTHER_RESULTTYPE_QUALIFIER = qualifier("other", "other", "dnet:result_typologies", "dnet:result_typologies"); + private static final Log log = LogFactory.getLog(AbstractMongoExecutor.class); + public AbstractMongoExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl, final String mongoDb, final String dbUrl, final String dbUser, final String dbPassword) throws Exception { @@ -66,6 +70,9 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { } private void loadClassNames(final String dbUrl, final String dbUser, final 
String dbPassword) throws IOException { + + log.info("Loading vocabulary terms from db..."); + try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) { code2name.clear(); dbClient.processResults("select code, name from class", rs -> { @@ -77,12 +84,19 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { }); } + log.info("Found " + code2name.size() + " terms."); + } public void processMdRecords(final String mdFormat, final String mdLayout, final String mdInterpretation) throws DocumentException { - for (final Entry entry : mdstoreClient.validCollections(mdFormat, mdLayout, mdInterpretation).entrySet()) { - // final String mdId = entry.getKey(); + log.info(String.format("Searching mdstores (format: %s, layout: %s, interpretation: %s)", mdFormat, mdLayout, mdInterpretation)); + + final Map colls = mdstoreClient.validCollections(mdFormat, mdLayout, mdInterpretation); + log.info("Found " + colls.size() + " mdstores"); + + for (final Entry entry : colls.entrySet()) { + log.info("Processing mdstore " + entry.getKey() + " (collection: " + entry.getValue() + ")"); final String currentColl = entry.getValue(); for (final String xml : mdstoreClient.listRecords(currentColl)) { @@ -101,6 +115,7 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { } } } + log.info("All Done."); } protected void registerNamespaces(final Map nsContext) { diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json index 3cd6f39f5..5738daa76 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json @@ -48,21 +48,21 @@ "paramRequired": true }, { - "paramName": "postgresUrl", 
+ "paramName": "pgurl", "paramLongName": "postgresUrl", "paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb", "paramRequired": true }, { - "paramName": "postgresUser", + "paramName": "pguser", "paramLongName": "postgresUser", "paramDescription": "postgres user", - "paramRequired": true + "paramRequired": false }, { - "paramName": "postgresPassword", + "paramName": "pgpasswd", "paramLongName": "postgresPassword", "paramDescription": "postgres password", - "paramRequired": true + "paramRequired": false } ] \ No newline at end of file From 80cb52593f80c1287498747b69286eaf730db943 Mon Sep 17 00:00:00 2001 From: Michele Artini Date: Thu, 13 Feb 2020 15:34:13 +0100 Subject: [PATCH 33/45] bug fixing --- .../migration/AbstractMigrationExecutor.java | 4 +++ .../dhp/migration/AbstractMongoExecutor.java | 5 +++- .../dnetlib/dhp/migration/MdstoreClient.java | 25 ++++++++++++------- .../dhp/migration/OdfMigrationExecutor.java | 1 + 4 files changed, 25 insertions(+), 10 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrationExecutor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrationExecutor.java index 11c1fb6ae..e91a53045 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrationExecutor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrationExecutor.java @@ -111,6 +111,10 @@ public class AbstractMigrationExecutor implements Closeable { return Arrays.stream(values).map(v -> field(v, info)).filter(Objects::nonNull).collect(Collectors.toList()); } + public static List> listFields(final DataInfo info, final List values) { + return values.stream().map(v -> field(v, info)).filter(Objects::nonNull).collect(Collectors.toList()); + } + public static Qualifier qualifier(final String classid, final String classname, final String schemeid, final String schemename) { final 
Qualifier q = new Qualifier(); q.setClassid(classid); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java index 83e05c59f..d1b618c7a 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java @@ -385,6 +385,8 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { protected OAIProvenance prepareOAIprovenance(final Document doc) { final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']"); + if (n == null) { return null; } + final String identifier = n.valueOf("./*[local-name()='identifier']"); final String baseURL = n.valueOf("./*[local-name()='baseURL']");; final String metadataNamespace = n.valueOf("./*[local-name()='metadataNamespace']");; @@ -393,6 +395,7 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { final String harvestDate = n.valueOf("@harvestDate");; return oaiIProvenance(identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate); + } protected DataInfo prepareDataInfo(final Document doc) { @@ -416,7 +419,7 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { } protected List> prepareListFields(final Node node, final String xpath, final DataInfo info) { - return listFields(info, (String[]) prepareListString(node, xpath).toArray()); + return listFields(info, prepareListString(node, xpath)); } protected List prepareListString(final Node node, final String xpath) { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MdstoreClient.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MdstoreClient.java index 971d7f165..87dadfc7a 100644 --- 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MdstoreClient.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MdstoreClient.java @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.migration; import java.io.Closeable; import java.io.IOException; +import java.util.ArrayList; import java.util.HashMap; import java.util.Map; import java.util.stream.StreamSupport; @@ -35,7 +36,7 @@ public class MdstoreClient implements Closeable { public Map validCollections(final String mdFormat, final String mdLayout, final String mdInterpretation) { final Map transactions = new HashMap<>(); - for (final Document entry : getColl(db, COLL_METADATA_MANAGER).find()) { + for (final Document entry : getColl(db, COLL_METADATA_MANAGER, true).find()) { final String mdId = entry.getString("mdId"); final String currentId = entry.getString("currentId"); if (StringUtils.isNoneBlank(mdId, currentId)) { @@ -44,7 +45,7 @@ public class MdstoreClient implements Closeable { } final Map res = new HashMap<>(); - for (final Document entry : getColl(db, COLL_METADATA).find()) { + for (final Document entry : getColl(db, COLL_METADATA, true).find()) { if (entry.getString("format").equals(mdFormat) && entry.getString("layout").equals(mdLayout) && entry.getString("interpretation").equals(mdInterpretation) && transactions.containsKey(entry.getString("mdId"))) { res.put(entry.getString("mdId"), transactions.get(entry.getString("mdId"))); @@ -63,20 +64,26 @@ public class MdstoreClient implements Closeable { return client.getDatabase(dbName); } - private MongoCollection getColl(final MongoDatabase db, final String collName) { + private MongoCollection getColl(final MongoDatabase db, final String collName, final boolean abortIfMissing) { if (!Iterables.contains(db.listCollectionNames(), collName)) { final String err = String.format(String.format("Missing collection '%s' in database '%s'", collName, db.getName())); log.warn(err); - throw new RuntimeException(err); + if 
(abortIfMissing) { + throw new RuntimeException(err); + } else { + return null; + } } return db.getCollection(collName); } - public Iterable listRecords(final String coll) { - return () -> StreamSupport.stream(getColl(db, coll).find().spliterator(), false) - .filter(e -> e.containsKey("body")) - .map(e -> e.getString("body")) - .iterator(); + public Iterable listRecords(final String collName) { + final MongoCollection coll = getColl(db, collName, false); + return coll == null ? new ArrayList<>() + : () -> StreamSupport.stream(coll.find().spliterator(), false) + .filter(e -> e.containsKey("body")) + .map(e -> e.getString("body")) + .iterator(); } @Override diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OdfMigrationExecutor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OdfMigrationExecutor.java index b1dbfcdf4..54636b3bf 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OdfMigrationExecutor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OdfMigrationExecutor.java @@ -34,6 +34,7 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor { @Override protected void registerNamespaces(final Map nsContext) { + super.registerNamespaces(nsContext); nsContext.put("dc", "http://datacite.org/schema/kernel-3"); } From 956da2f923bf622b66c6c9695e21367656f303a9 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 13 Feb 2020 16:49:45 +0100 Subject: [PATCH 34/45] added Saxon-HE extension functions and Transformer factory class --- dhp-common/pom.xml | 4 ++ .../saxon/AbstractExtensionFunction.java | 32 +++++++++ .../dnetlib/dhp/utils/saxon/ExtractYear.java | 67 +++++++++++++++++++ .../dhp/utils/saxon/NormalizeDate.java | 66 ++++++++++++++++++ .../eu/dnetlib/dhp/utils/saxon/PickFirst.java | 53 +++++++++++++++ .../utils/saxon/SaxonTransformerFactory.java | 30 +++++++++ 6 files changed, 252 insertions(+) create mode 100644 
dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index 43c2a3834..ae7302b98 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -42,6 +42,10 @@ com.rabbitmq amqp-client + + net.sf.saxon + Saxon-HE + diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java new file mode 100644 index 000000000..bd3962440 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java @@ -0,0 +1,32 @@ +package eu.dnetlib.dhp.utils.saxon; + +import net.sf.saxon.expr.XPathContext; +import net.sf.saxon.lib.ExtensionFunctionCall; +import net.sf.saxon.lib.ExtensionFunctionDefinition; +import net.sf.saxon.om.Sequence; +import net.sf.saxon.om.StructuredQName; +import net.sf.saxon.trans.XPathException; + +public abstract class AbstractExtensionFunction extends ExtensionFunctionDefinition { + + public static String DEFAULT_SAXON_EXT_NS_URI = "http://www.d-net.research-infrastructures.eu/saxon-extension"; + + public abstract String getName(); + public abstract Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException; + + @Override + public StructuredQName getFunctionQName() { + return new StructuredQName("dnet", DEFAULT_SAXON_EXT_NS_URI, getName()); + } + + @Override + public ExtensionFunctionCall makeCallExpression() { + return new ExtensionFunctionCall() { + @Override + public Sequence call(XPathContext context, Sequence[] arguments) throws XPathException { 
+ return doCall(context, arguments); + } + }; + } + +} \ No newline at end of file diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java new file mode 100644 index 000000000..f90e2a23e --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java @@ -0,0 +1,67 @@ +package eu.dnetlib.dhp.utils.saxon; + +import net.sf.saxon.expr.XPathContext; +import net.sf.saxon.om.Item; +import net.sf.saxon.om.Sequence; +import net.sf.saxon.trans.XPathException; +import net.sf.saxon.value.SequenceType; +import net.sf.saxon.value.StringValue; + +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.Calendar; +import java.util.GregorianCalendar; + +public class ExtractYear extends AbstractExtensionFunction { + + private static final String[] dateFormats = { "yyyy-MM-dd", "yyyy/MM/dd" }; + + @Override + public String getName() { + return "extractYear"; + } + + @Override + public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException { + if (arguments == null | arguments.length == 0) { + return new StringValue(""); + } + final Item item = arguments[0].head(); + if (item == null) { + return new StringValue(""); + } + return new StringValue(_year(item.getStringValue())); + } + + @Override + public int getMinimumNumberOfArguments() { + return 0; + } + + @Override + public int getMaximumNumberOfArguments() { + return 1; + } + + @Override + public SequenceType[] getArgumentTypes() { + return new SequenceType[] { SequenceType.OPTIONAL_ITEM }; + } + + @Override + public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) { + return SequenceType.SINGLE_STRING; + } + + private String _year(String s) { + Calendar c = new GregorianCalendar(); + for (String format : dateFormats) { + try { + c.setTime(new SimpleDateFormat(format).parse(s)); + String year = String.valueOf(c.get(Calendar.YEAR)); + return 
year; + } catch (ParseException e) {} + } + return ""; + } +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java new file mode 100644 index 000000000..634e08788 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java @@ -0,0 +1,66 @@ +package eu.dnetlib.dhp.utils.saxon; + +import net.sf.saxon.expr.XPathContext; +import net.sf.saxon.om.Sequence; +import net.sf.saxon.trans.XPathException; +import net.sf.saxon.value.SequenceType; +import net.sf.saxon.value.StringValue; + +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.Date; + +public class NormalizeDate extends AbstractExtensionFunction { + + private static final String[] normalizeDateFormats = { "yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "yyyy/MM/dd", "yyyy" }; + + private static final String normalizeOutFormat = new String("yyyy-MM-dd'T'hh:mm:ss'Z'"); + + @Override + public String getName() { + return "normalizeDate"; + } + + @Override + public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException { + if (arguments == null | arguments.length == 0) { + return new StringValue(""); + } + String s = arguments[0].head().getStringValue(); + return new StringValue(_year(s)); + } + + @Override + public int getMinimumNumberOfArguments() { + return 0; + } + + @Override + public int getMaximumNumberOfArguments() { + return 1; + } + + @Override + public SequenceType[] getArgumentTypes() { + return new SequenceType[] { SequenceType.OPTIONAL_ITEM }; + } + + @Override + public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) { + return SequenceType.SINGLE_STRING; + } + + private String _year(String s) { + final String date = s != null ? 
s.trim() : ""; + + for (String format : normalizeDateFormats) { + try { + Date parse = new SimpleDateFormat(format).parse(date); + String res = new SimpleDateFormat(normalizeOutFormat).format(parse); + return res; + } catch (ParseException e) {} + } + return ""; + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java new file mode 100644 index 000000000..1f209bed0 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java @@ -0,0 +1,53 @@ +package eu.dnetlib.dhp.utils.saxon; + +import net.sf.saxon.expr.XPathContext; +import net.sf.saxon.om.Sequence; +import net.sf.saxon.trans.XPathException; +import net.sf.saxon.value.SequenceType; +import net.sf.saxon.value.StringValue; +import org.apache.commons.lang3.StringUtils; + +public class PickFirst extends AbstractExtensionFunction { + + @Override + public String getName() { + return "pickFirst"; + } + + @Override + public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException { + if (arguments == null | arguments.length == 0) { + return new StringValue(""); + } + String s1 = arguments[0].head().getStringValue(); + + if (arguments.length > 1) { + String s2 = arguments[1].head().getStringValue(); + + return new StringValue(StringUtils.isNotBlank(s1) ? s1 : StringUtils.isNotBlank(s2) ? s2 : ""); + } else { + return new StringValue(StringUtils.isNotBlank(s1) ? 
s1 : ""); + } + } + + @Override + public int getMinimumNumberOfArguments() { + return 0; + } + + @Override + public int getMaximumNumberOfArguments() { + return 2; + } + + @Override + public SequenceType[] getArgumentTypes() { + return new SequenceType[] { SequenceType.OPTIONAL_ITEM }; + } + + @Override + public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) { + return SequenceType.SINGLE_STRING; + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java new file mode 100644 index 000000000..611709ff0 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java @@ -0,0 +1,30 @@ +package eu.dnetlib.dhp.utils.saxon; + +import net.sf.saxon.Configuration; +import net.sf.saxon.TransformerFactoryImpl; + +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerException; +import javax.xml.transform.stream.StreamSource; +import java.io.StringReader; + +public class SaxonTransformerFactory { + + /** + * Creates the index record transformer from the given XSLT + * @param xslt + * @return + * @throws TransformerException + */ + public static Transformer newInstance(final String xslt) throws TransformerException { + + final TransformerFactoryImpl factory = new TransformerFactoryImpl(); + final Configuration conf = factory.getConfiguration(); + conf.registerExtensionFunction(new ExtractYear()); + conf.registerExtensionFunction(new NormalizeDate()); + conf.registerExtensionFunction(new PickFirst()); + + return factory.newTransformer(new StreamSource(new StringReader(xslt))); + } + +} From 1fee6e2b7e4b06226a3769bb961de7657b858f86 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 13 Feb 2020 16:53:27 +0100 Subject: [PATCH 35/45] implemented XML records construction and serialization, indexing WIP --- .gitignore | 4 +- .../dhp/graph/SparkGraphImporterJob.java | 15 +- 
.../job-override.properties | 16 +- dhp-workflows/dhp-graph-provision/pom.xml | 46 + .../eu/dnetlib/dhp/graph/GraphJoiner.java | 151 ++- .../dnetlib/dhp/graph/GraphMappingUtils.java | 160 --- .../eu/dnetlib/dhp/graph/LinkedEntity.java | 29 - .../dhp/graph/SparkXmlIndexingJob.java | 188 ++++ .../dhp/graph/SparkXmlRecordBuilderJob.java | 38 +- .../main/java/eu/dnetlib/dhp/graph/Tuple.java | 29 - .../eu/dnetlib/dhp/graph/TupleWrapper.java | 29 - .../graph/{ => model}/EntityRelEntity.java | 3 +- .../dhp/graph/{ => model}/JoinedEntity.java | 11 +- .../eu/dnetlib/dhp/graph/model/Links.java | 6 + .../dhp/graph/{ => model}/RelatedEntity.java | 22 +- .../eu/dnetlib/dhp/graph/model/Tuple2.java | 28 + .../dhp/graph/{ => model}/TypedRow.java | 2 +- .../dnetlib/dhp/graph/utils/ContextDef.java | 51 + .../dhp/graph/utils/ContextMapper.java | 45 + .../dhp/graph/utils/GraphMappingUtils.java | 254 +++++ .../graph/utils/ISLookupClientFactory.java | 24 + .../dhp/graph/utils/LicenseComparator.java | 49 + .../utils/StreamingInputDocumentFactory.java | 253 +++++ .../dhp/graph/utils/TemplateFactory.java | 107 ++ .../dhp/graph/utils/TemplateResources.java | 54 + .../dhp/graph/utils/XmlRecordFactory.java | 962 ++++++++++++++++++ .../graph/utils/XmlSerializationUtils.java | 151 +++ .../javax.xml.transform.TransformerFactory | 1 + ...> input_params_build_adjacency_lists.json} | 3 +- .../dhp/graph/input_params_update_index.json | 7 + .../dnetlib/dhp/graph/oozie_app/workflow.xml | 45 +- .../eu/dnetlib/dhp/graph/template/child.st | 3 + .../eu/dnetlib/dhp/graph/template/entity.st | 10 + .../eu/dnetlib/dhp/graph/template/instance.st | 4 + .../eu/dnetlib/dhp/graph/template/record.st | 17 + .../eu/dnetlib/dhp/graph/template/rel.st | 4 + .../dnetlib/dhp/graph/template/webresource.st | 3 + .../dnetlib/dhp/graph/MappingUtilsTest.java | 24 +- .../dhp/graph/XmlRecordFactoryTest.java | 55 + .../eu/dnetlib/dhp/graph/software.json | 1 + pom.xml | 69 +- 41 files changed, 2571 insertions(+), 402 deletions(-) 
delete mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphMappingUtils.java delete mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/LinkedEntity.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlIndexingJob.java delete mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/Tuple.java delete mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/TupleWrapper.java rename dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/{ => model}/EntityRelEntity.java (96%) rename dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/{ => model}/JoinedEntity.java (65%) create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/Links.java rename dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/{ => model}/RelatedEntity.java (94%) create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/Tuple2.java rename dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/{ => model}/TypedRow.java (96%) create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/ContextDef.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/ContextMapper.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/GraphMappingUtils.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/ISLookupClientFactory.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/LicenseComparator.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/StreamingInputDocumentFactory.java create mode 100644 
dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/TemplateFactory.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/TemplateResources.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/XmlRecordFactory.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/XmlSerializationUtils.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/resources/META-INF/services/javax.xml.transform.TransformerFactory rename dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/{input_graph_parameters.json => input_params_build_adjacency_lists.json} (65%) create mode 100644 dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_params_update_index.json create mode 100644 dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/child.st create mode 100644 dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/entity.st create mode 100644 dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/instance.st create mode 100644 dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/record.st create mode 100644 dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/rel.st create mode 100644 dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/webresource.st create mode 100644 dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/XmlRecordFactoryTest.java create mode 100644 dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/software.json diff --git a/.gitignore b/.gitignore index 3f00d9729..66fe55aa9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,8 @@ .DS_Store .idea *.iml +*.ipr +*.iws *~ .classpath /*/.classpath @@ -18,5 +20,5 @@ /*/build /build spark-warehouse 
-/*/*/job-override.properties +/**/job-override.properties diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/SparkGraphImporterJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/SparkGraphImporterJob.java index a6a4e9291..5401b71c1 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/SparkGraphImporterJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/SparkGraphImporterJob.java @@ -33,17 +33,12 @@ public class SparkGraphImporterJob { // Read the input file and convert it into RDD of serializable object GraphMappingUtils.types.forEach((name, clazz) -> { - final JavaRDD> inputRDD = sc.sequenceFile(inputPath + "/" + name, Text.class, Text.class) - .map(item -> new Tuple2<>(item._1.toString(), item._2.toString())); - - spark.createDataset(inputRDD - .filter(s -> s._1().equals(clazz.getName())) - .map(Tuple2::_2) - .map(s -> new ObjectMapper().readValue(s, clazz)) + spark.createDataset(sc.sequenceFile(inputPath + "/" + name, Text.class, Text.class) + .map(s -> new ObjectMapper().readValue(s._2().toString(), clazz)) .rdd(), Encoders.bean(clazz)) - .write() - .mode(SaveMode.Overwrite) - .saveAsTable(hiveDbName + "." + name); + .write() + .mode(SaveMode.Overwrite) + .saveAsTable(hiveDbName + "." 
+ name); }); } diff --git a/dhp-workflows/dhp-graph-provision/job-override.properties b/dhp-workflows/dhp-graph-provision/job-override.properties index acaf16717..c7b173a14 100644 --- a/dhp-workflows/dhp-graph-provision/job-override.properties +++ b/dhp-workflows/dhp-graph-provision/job-override.properties @@ -1,5 +1,11 @@ -sparkDriverMemory=7G -sparkExecutorMemory=7G -hive_db_name=claudio -sourcePath=/tmp/db_openaireplus_services_beta.export.2019.11.06 -outputPath=/tmp/openaire_provision \ No newline at end of file +sparkDriverMemory=8G +sparkExecutorMemory=8G +#isLookupUrl=http://services.openaire.eu:8280/is/services/isLookUp +isLookupUrl=http://beta.services.openaire.eu:8280/is/services/isLookUp?wsdl +sourcePath=/tmp/db_openaireplus_services.export_dhp.2020.02.03 +outputPath=/tmp/openaire_provision +format=TMF +batchSize=1000 +sparkExecutorCoresForIndexing=1 +sparkExecutorInstances=10 +reuseRecords=false \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/pom.xml b/dhp-workflows/dhp-graph-provision/pom.xml index 62d8ac2ae..5e6beb249 100644 --- a/dhp-workflows/dhp-graph-provision/pom.xml +++ b/dhp-workflows/dhp-graph-provision/pom.xml @@ -23,6 +23,52 @@ com.jayway.jsonpath json-path + + dom4j + dom4j + + + jaxen + jaxen + + + com.mycila.xmltool + xmltool + + + org.antlr + stringtemplate + + + org.apache.solr + solr-solrj + + + com.lucidworks.spark + spark-solr + + + + org.apache.httpcomponents + httpclient + + + org.noggit + noggit + + + org.apache.zookeeper + zookeeper + + + + org.apache.cxf + cxf-rt-transports-http + + + eu.dnetlib + cnr-rmi-api + eu.dnetlib.dhp diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java index f7bf0da39..062c8886b 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java +++ 
b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java @@ -6,9 +6,14 @@ import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.jayway.jsonpath.DocumentContext; import com.jayway.jsonpath.JsonPath; +import eu.dnetlib.dhp.graph.model.*; +import eu.dnetlib.dhp.graph.utils.ContextMapper; +import eu.dnetlib.dhp.graph.utils.GraphMappingUtils; +import eu.dnetlib.dhp.graph.utils.XmlRecordFactory; import eu.dnetlib.dhp.schema.oaf.*; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; @@ -18,9 +23,12 @@ import scala.Tuple2; import java.io.IOException; import java.io.Serializable; +import java.util.HashSet; import java.util.List; import java.util.stream.Collectors; +import static eu.dnetlib.dhp.graph.utils.GraphMappingUtils.asRelatedEntity; + /** * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. 
* The operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, @@ -41,16 +49,21 @@ import java.util.stream.Collectors; */ public class GraphJoiner implements Serializable { - public static final int MAX_RELS = 10; + public static final int MAX_RELS = 100; + + public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd"; private SparkSession spark; + private ContextMapper contextMapper; + private String inputPath; private String outPath; - public GraphJoiner(SparkSession spark, String inputPath, String outPath) { + public GraphJoiner(SparkSession spark, ContextMapper contextMapper, String inputPath, String outPath) { this.spark = spark; + this.contextMapper = contextMapper; this.inputPath = inputPath; this.outPath = outPath; } @@ -68,7 +81,7 @@ public class GraphJoiner implements Serializable { JavaPairRDD publication = readPathEntity(sc, getInputPath(), "publication"); // create the union between all the entities - final String entitiesPath = getOutPath() + "/0_entities"; + final String entitiesPath = getOutPath() + "/entities"; datasource .union(organization) .union(project) @@ -94,102 +107,74 @@ public class GraphJoiner implements Serializable { .flatMap(p -> p.iterator()) .mapToPair(p -> new Tuple2<>(p.getRelation().getTargetId(), p)); - final String joinByTargetPath = getOutPath() + "/1_join_by_target"; - relation + //final String bySource = getOutPath() + "/1_join_by_target"; + JavaPairRDD bySource = relation .join(entities .filter(e -> !e._2().getSource().getDeleted()) - .mapToPair(e -> new Tuple2<>(e._1(), new GraphMappingUtils().pruneModel(e._2())))) + .mapToPair(e -> new Tuple2<>(e._1(), asRelatedEntity(e._2())))) .map(s -> new EntityRelEntity() .setRelation(s._2()._1().getRelation()) .setTarget(s._2()._2().getSource())) - .map(GraphMappingUtils::serialize) - .saveAsTextFile(joinByTargetPath, GzipCodec.class); - - JavaPairRDD bySource = sc.textFile(joinByTargetPath) - 
.map(t -> new ObjectMapper().readValue(t, EntityRelEntity.class)) .mapToPair(t -> new Tuple2<>(t.getRelation().getSourceId(), t)); - final String linkedEntityPath = getOutPath() + "/2_linked_entities"; + final XmlRecordFactory recordFactory = new XmlRecordFactory(contextMapper, false, schemaLocation, new HashSet<>()); entities .union(bySource) .groupByKey() // by source id - .map(p -> toLinkedEntity(p)) - .map(e -> new ObjectMapper().setSerializationInclusion(JsonInclude.Include.NON_NULL).writeValueAsString(e)) - .saveAsTextFile(linkedEntityPath, GzipCodec.class); - - final String joinedEntitiesPath = getOutPath() + "/3_joined_entities"; - sc.textFile(linkedEntityPath) - .map(s -> new ObjectMapper().readValue(s, LinkedEntity.class)) .map(l -> toJoinedEntity(l)) - .map(j -> new ObjectMapper().setSerializationInclusion(JsonInclude.Include.NON_NULL).writeValueAsString(j)) - .saveAsTextFile(joinedEntitiesPath); + .mapToPair(je -> new Tuple2<>( + new Text(je.getEntity().getId()), + new Text(recordFactory.build(je)))) + .saveAsHadoopFile(getOutPath() + "/xml", Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); return this; } public GraphJoiner asXML() { final JavaSparkContext sc = new JavaSparkContext(getSpark().sparkContext()); + final XmlRecordFactory recordFactory = new XmlRecordFactory(contextMapper, true, "", new HashSet<>()); + final ObjectMapper mapper = new ObjectMapper(); - final String joinedEntitiesPath = getOutPath() + "/3_joined_entities"; + final String joinedEntitiesPath = getOutPath() + "/1_joined_entities"; sc.textFile(joinedEntitiesPath) - .map(s -> new ObjectMapper().readValue(s, LinkedEntity.class)) - .map(l -> toXML(l)) - .saveAsTextFile(getOutPath() + "/4_xml"); + .map(s -> mapper.readValue(s, JoinedEntity.class)) + .mapToPair(je -> new Tuple2<>(new Text(je.getEntity().getId()), new Text(recordFactory.build(je)))) + .saveAsHadoopFile(getOutPath() + "/2_xml", Text.class, Text.class, SequenceFileOutputFormat.class, 
GzipCodec.class); return this; } - private String toXML(LinkedEntity l) { - - return null; - } - public SparkSession getSpark() { return spark; } - public GraphJoiner setSpark(SparkSession spark) { - this.spark = spark; - return this; - } - public String getInputPath() { return inputPath; } - public GraphJoiner setInputPath(String inputPath) { - this.inputPath = inputPath; - return this; - } - public String getOutPath() { return outPath; } - public GraphJoiner setOutPath(String outPath) { - this.outPath = outPath; - return this; - } - // HELPERS private OafEntity parseOaf(final String json, final String type) { final ObjectMapper o = new ObjectMapper(); try { - switch (type) { - case "publication": + switch (GraphMappingUtils.EntityType.valueOf(type)) { + case publication: return o.readValue(json, Publication.class); - case "dataset": + case dataset: return o.readValue(json, Dataset.class); - case "otherresearchproduct": + case otherresearchproduct: return o.readValue(json, OtherResearchProduct.class); - case "software": + case software: return o.readValue(json, Software.class); - case "datasource": + case datasource: return o.readValue(json, Datasource.class); - case "organization": + case organization: return o.readValue(json, Organization.class); - case "project": + case project: return o.readValue(json, Project.class); default: throw new IllegalArgumentException("invalid type: " + type); @@ -199,56 +184,36 @@ public class GraphJoiner implements Serializable { } } - /** - * Converts the result of grouping pairs and the entities by source id to LinkedEntity - * @param p - * @return - */ - private LinkedEntity toLinkedEntity(Tuple2> p) { - final LinkedEntity e = new LinkedEntity(); - final List links = Lists.newArrayList(); + private JoinedEntity toJoinedEntity(Tuple2> p) { + final ObjectMapper o = new ObjectMapper(); + final JoinedEntity j = new JoinedEntity(); + final Links links2 = new Links(); for(EntityRelEntity rel : p._2()) { - if (rel.hasMainEntity() & 
e.getEntity() == null) { - e.setEntity(rel.getSource()); + if (rel.hasMainEntity() & j.getEntity() == null) { + j.setType(rel.getSource().getType()); + j.setEntity(parseOaf(rel.getSource().getOaf(), rel.getSource().getType())); } if (rel.hasRelatedEntity()) { - links.add(new Tuple() - .setRelation(rel.getRelation()) - .setTarget(rel.getTarget())); + try { + links2.add( + new eu.dnetlib.dhp.graph.model.Tuple2() + .setRelation(o.readValue(rel.getRelation().getOaf(), Relation.class)) + .setRelatedEntity(o.readValue(rel.getTarget().getOaf(), RelatedEntity.class))); + } catch (IOException e) { + throw new IllegalArgumentException(e); + } } } - e.setLinks(links); - if (e.getEntity() == null) { + j.setLinks(links2); + if (j.getEntity() == null) { throw new IllegalStateException("missing main entity on '" + p._1() + "'"); } - return e; - } - - /** - * Converts a LinkedEntity to a JoinedEntity - * @param l - * @return - */ - private JoinedEntity toJoinedEntity(LinkedEntity l) { - return new JoinedEntity().setType(l.getEntity().getType()) - .setEntity(parseOaf(l.getEntity().getOaf(), l.getEntity().getType())) - .setLinks(l.getLinks() - .stream() - .map(t -> { - final ObjectMapper o = new ObjectMapper(); - try { - return new Tuple2<>( - o.readValue(t.getRelation().getOaf(), Relation.class), - o.readValue(t.getTarget().getOaf(), RelatedEntity.class)); - } catch (IOException e) { - throw new IllegalArgumentException(e); - } - }).collect(Collectors.toList())); + return j; } /** * Reads a set of eu.dnetlib.dhp.schema.oaf.OafEntity objects from a sequence file , - * extracts necessary information using json path, wraps the oaf object in a eu.dnetlib.dhp.graph.TypedRow + * extracts necessary information using json path, wraps the oaf object in a eu.dnetlib.dhp.graph.model.TypedRow * @param sc * @param inputPath * @param type @@ -270,7 +235,7 @@ public class GraphJoiner implements Serializable { /** * Reads a set of eu.dnetlib.dhp.schema.oaf.Relation objects from a sequence file , - 
* extracts necessary information using json path, wraps the oaf object in a eu.dnetlib.dhp.graph.TypedRow + * extracts necessary information using json path, wraps the oaf object in a eu.dnetlib.dhp.graph.model.TypedRow * @param sc * @param inputPath * @return the JavaRDD containing all the relationships diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphMappingUtils.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphMappingUtils.java deleted file mode 100644 index e3622cd20..000000000 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphMappingUtils.java +++ /dev/null @@ -1,160 +0,0 @@ -package eu.dnetlib.dhp.graph; - -import com.fasterxml.jackson.annotation.JsonInclude; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.collect.Maps; -import com.jayway.jsonpath.DocumentContext; -import com.jayway.jsonpath.JsonPath; -import eu.dnetlib.dhp.schema.oaf.*; -import net.minidev.json.JSONArray; -import org.apache.commons.lang3.StringUtils; - -import java.util.LinkedHashMap; -import java.util.Map; -import java.util.stream.Collectors; - -public class GraphMappingUtils { - - public final static Map types = Maps.newHashMap(); - - static { - types.put("datasource", Datasource.class); - types.put("organization", Organization.class); - types.put("project", Project.class); - types.put("dataset", Dataset.class); - types.put("otherresearchproduct", OtherResearchProduct.class); - types.put("software", Software.class); - types.put("publication", Publication.class); - types.put("relation", Relation.class); - } - - public static EntityRelEntity pruneModel(EntityRelEntity e) { - - final DocumentContext j = JsonPath.parse(e.getSource().getOaf()); - final RelatedEntity re = new RelatedEntity().setId(j.read("$.id")).setType(e.getSource().getType()); - - switch (e.getSource().getType()) { - case "publication": - 
case "dataset": - case "otherresearchproduct": - case "software": - mapTitle(j, re); - re.setDateofacceptance(j.read("$.dateofacceptance.value")); - re.setPublisher(j.read("$.publisher.value")); - - JSONArray pids = j.read("$.pid"); - re.setPid(pids.stream() - .map(p -> asStructuredProperty((LinkedHashMap) p)) - .collect(Collectors.toList())); - - re.setResulttype(asQualifier(j.read("$.resulttype"))); - - JSONArray collfrom = j.read("$.collectedfrom"); - re.setCollectedfrom(collfrom.stream() - .map(c -> asKV((LinkedHashMap)c)) - .collect(Collectors.toList())); - - //TODO still to be mapped - //re.setCodeRepositoryUrl(j.read("$.coderepositoryurl")); - - break; - case "datasource": - re.setOfficialname(j.read("$.officialname.value")); - re.setWebsiteurl(j.read("$.websiteurl.value")); - re.setDatasourcetype(asQualifier(j.read("$.datasourcetype"))); - re.setOpenairecompatibility(asQualifier(j.read("$.openairecompatibility"))); - - break; - case "organization": - re.setLegalname(j.read("$.legalname.value")); - re.setLegalshortname(j.read("$.legalshortname.value")); - re.setCountry(asQualifier(j.read("$.country"))); - - break; - case "project": - re.setProjectTitle(j.read("$.title.value")); - re.setCode(j.read("$.code.value")); - re.setAcronym(j.read("$.acronym.value")); - re.setContracttype(asQualifier(j.read("$.contracttype"))); - - JSONArray f = j.read("$.fundingtree"); - if (!f.isEmpty()) { - re.setFundingtree(f.stream() - .map(s -> s.toString()) - .collect(Collectors.toList())); - } - - break; - } - return new EntityRelEntity().setSource( - new TypedRow() - .setSourceId(e.getSource().getSourceId()) - .setDeleted(e.getSource().getDeleted()) - .setType(e.getSource().getType()) - .setOaf(serialize(re))); - } - - private static KeyValue asKV(LinkedHashMap j) { - final KeyValue kv = new KeyValue(); - kv.setKey((String) j.get("key")); - kv.setValue((String) j.get("value")); - return kv; - } - - private static void mapTitle(DocumentContext j, RelatedEntity re) { - final 
JSONArray a = j.read("$.title"); - if (!a.isEmpty()) { - final StructuredProperty sp = asStructuredProperty((LinkedHashMap) a.get(0)); - if(StringUtils.isNotBlank(sp.getValue())) { - re.setTitle(sp); - } - } - } - - private static StructuredProperty asStructuredProperty(LinkedHashMap j) { - final StructuredProperty sp = new StructuredProperty(); - final String value = (String) j.get("value"); - if (StringUtils.isNotBlank(value)) { - sp.setValue((String) j.get("value")); - sp.setQualifier(asQualifier((LinkedHashMap) j.get("qualifier"))); - } - return sp; - } - - public static Qualifier asQualifier(LinkedHashMap j) { - final Qualifier q = new Qualifier(); - - final String classid = j.get("classid"); - if (StringUtils.isNotBlank(classid)) { - q.setClassid(classid); - } - - final String classname = j.get("classname"); - if (StringUtils.isNotBlank(classname)) { - q.setClassname(classname); - } - - final String schemeid = j.get("schemeid"); - if (StringUtils.isNotBlank(schemeid)) { - q.setSchemeid(schemeid); - } - - final String schemename = j.get("schemename"); - if (StringUtils.isNotBlank(schemename)) { - q.setSchemename(schemename); - } - return q; - } - - public static String serialize(final Object o) { - try { - return new ObjectMapper() - .setSerializationInclusion(JsonInclude.Include.NON_NULL) - .writeValueAsString(o); - } catch (JsonProcessingException e) { - throw new IllegalArgumentException("unable to serialize: " + o.toString(), e); - } - } - -} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/LinkedEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/LinkedEntity.java deleted file mode 100644 index 9e6fc0d38..000000000 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/LinkedEntity.java +++ /dev/null @@ -1,29 +0,0 @@ -package eu.dnetlib.dhp.graph; - -import java.io.Serializable; -import java.util.List; - -public class LinkedEntity implements Serializable { - - private 
TypedRow entity; - - private List links; - - public TypedRow getEntity() { - return entity; - } - - public LinkedEntity setEntity(TypedRow entity) { - this.entity = entity; - return this; - } - - public List getLinks() { - return links; - } - - public LinkedEntity setLinks(List links) { - this.links = links; - return this; - } -} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlIndexingJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlIndexingJob.java new file mode 100644 index 000000000..e13f8bbe2 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlIndexingJob.java @@ -0,0 +1,188 @@ +package eu.dnetlib.dhp.graph; + +import com.lucidworks.spark.util.SolrSupport; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.graph.utils.ISLookupClientFactory; +import eu.dnetlib.dhp.graph.utils.StreamingInputDocumentFactory; +import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpDocumentNotFoundException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.io.Text; +import org.apache.solr.common.SolrInputDocument; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.SparkSession; + +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerException; +import javax.xml.transform.stream.StreamResult; +import javax.xml.transform.stream.StreamSource; +import java.io.IOException; +import java.io.StringReader; +import java.io.StringWriter; +import java.text.SimpleDateFormat; +import java.util.Date; 
+ +public class SparkXmlIndexingJob { + + private static final Log log = LogFactory.getLog(SparkXmlIndexingJob.class); + + private static final Integer DEFAULT_BATCH_SIZE = 1000; + + private static final String LAYOUT = "index"; + + public static void main(String[] args) throws Exception { + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkXmlIndexingJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_params_update_index.json"))); + parser.parseArgument(args); + + final String inputPath = parser.get("sourcePath"); + final String isLookupUrl = parser.get("isLookupUrl"); + final String format = parser.get("format"); + final Integer batchSize = parser.getObjectMap().containsKey("batchSize") ? Integer.valueOf(parser.get("batchSize")) : DEFAULT_BATCH_SIZE; + + final ISLookUpService isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl); + final String fields = getLayoutSource(isLookup, format); + final String xslt = getLayoutTransformer(isLookup); + + final String dsId = getDsId(format, isLookup); + final String zkHost = getZkHost(isLookup); + final String version = getRecordDatestamp(); + + final String indexRecordXslt = getLayoutTransformer(format, fields, xslt); + + log.info("indexRecordTransformer: " + indexRecordXslt); + + final String master = parser.get("master"); + final SparkConf conf = new SparkConf() + .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + + try(SparkSession spark = getSession(conf, master)) { + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + RDD docs = sc.sequenceFile(inputPath, Text.class, Text.class) + .map(t -> t._2().toString()) + .map(s -> toIndexRecord(SaxonTransformerFactory.newInstance(indexRecordXslt), s)) + .map(s -> new StreamingInputDocumentFactory(version, dsId).parseDocument(s)) + .rdd(); + + SolrSupport.indexDocs(zkHost, format + "-" + LAYOUT + "-openaire", batchSize, docs); + } + } + + private static SparkSession 
getSession(SparkConf conf, String master) { + return SparkSession + .builder() + .config(conf) + .appName(SparkXmlIndexingJob.class.getSimpleName()) + .master(master) + .getOrCreate(); + } + + private static String toIndexRecord(Transformer tr, final String record) { + final StreamResult res = new StreamResult(new StringWriter()); + try { + tr.transform(new StreamSource(new StringReader(record)), res); + return res.getWriter().toString(); + } catch (Throwable e) { + System.out.println("XPathException on record:\n" + record); + throw new IllegalArgumentException(e); + } + } + + /** + * Creates the XSLT responsible for building the index xml records. + * + * @param format Metadata format name (DMF|TMF) + * @param xslt xslt for building the index record transformer + * @param fields the list of fields + * @return the generated XSLT, serialized as a string + * @throws ISLookUpException could happen + * @throws IOException could happen + * @throws TransformerException could happen + */ + private static String getLayoutTransformer(String format, String fields, String xslt) throws TransformerException { + + final Transformer layoutTransformer = SaxonTransformerFactory.newInstance(xslt); + final StreamResult layoutToXsltXslt = new StreamResult(new StringWriter()); + + layoutTransformer.setParameter("format", format); + layoutTransformer.transform(new StreamSource(new StringReader(fields)), layoutToXsltXslt); + + return layoutToXsltXslt.getWriter().toString(); + } + + /** + * Returns the current date formatted as yyyy-MM-dd'T'HH:mm:ss'Z' (24-hour clock), used to mark all records as indexed today + * @return the formatted current date + */ + public static String getRecordDatestamp() { + return new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'").format(new Date()); + } + + /** + * Method retrieves from the information system the list of fields associated to the given MDFormat name + * + * @param isLookup the ISLookup service stub + * @param format the Metadata format name + * @return the string 
representation of the list of fields to be indexed + * + * @throws ISLookUpDocumentNotFoundException + * @throws ISLookUpException + */ + private static String getLayoutSource(final ISLookUpService isLookup, final String format) throws ISLookUpDocumentNotFoundException, ISLookUpException { + return doLookup(isLookup, String.format( + "collection('')//RESOURCE_PROFILE[.//RESOURCE_TYPE/@value = 'MDFormatDSResourceType' and .//NAME='%s']//LAYOUT[@name='%s']", format, LAYOUT)); + } + + /** + * Method retrieves from the information system the openaireLayoutToRecordStylesheet + * + * @param isLookup the ISLookup service stub + * @return the string representation of the XSLT contained in the transformation rule profile + * + * @throws ISLookUpDocumentNotFoundException + * @throws ISLookUpException + */ + private static String getLayoutTransformer(ISLookUpService isLookup) throws ISLookUpException { + return doLookup(isLookup, "collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType')" + + "//RESOURCE_PROFILE[./BODY/CONFIGURATION/SCRIPT/TITLE/text() = 'openaireLayoutToRecordStylesheet']//CODE/node()"); + } + + /** + * Method retrieves from the information system the IndexDS profile ID associated to the given MDFormat name + * @param format + * @param isLookup + * @return the IndexDS identifier + * @throws ISLookUpException + */ + private static String getDsId(String format, ISLookUpService isLookup) throws ISLookUpException { + return doLookup(isLookup, String.format("collection('/db/DRIVER/IndexDSResources/IndexDSResourceType')" + + "//RESOURCE_PROFILE[./BODY/CONFIGURATION/METADATA_FORMAT/text() = '%s']//RESOURCE_IDENTIFIER/@value/string()", format)); + } + + /** + * Method retrieves from the information system the zookeeper quorum of the Solr server + * @param isLookup + * @return the zookeeper quorum of the Solr server + * @throws ISLookUpException + */ + private static String getZkHost(ISLookUpService isLookup) throws ISLookUpException { + 
return doLookup(isLookup, "for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='IndexServiceResourceType'] return $x//PROTOCOL[./@name='solr']/@address/string()"); + } + + private static String doLookup(ISLookUpService isLookup, String xquery) throws ISLookUpException { + log.info(String.format("running xquery: %s", xquery)); + final String res = isLookup.getResourceProfileByQuery(xquery); + log.info(String.format("got response (100 chars): %s", StringUtils.left(res, 100) + " ...")); + return res; + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlRecordBuilderJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlRecordBuilderJob.java index 38bc2bae2..0b2180f19 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlRecordBuilderJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlRecordBuilderJob.java @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.graph; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.graph.utils.ContextMapper; import org.apache.commons.io.IOUtils; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -11,30 +12,37 @@ public class SparkXmlRecordBuilderJob { public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkXmlRecordBuilderJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_graph_parameters.json"))); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkXmlRecordBuilderJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_params_build_adjacency_lists.json"))); parser.parseArgument(args); + final String master = parser.get("master"); final SparkConf conf = new SparkConf() .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - final SparkSession spark = SparkSession + 
try(SparkSession spark = getSession(conf, master)) { + + final String inputPath = parser.get("sourcePath"); + final String outputPath = parser.get("outputPath"); + final String isLookupUrl = parser.get("isLookupUrl"); + + final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration()); + if (fs.exists(new Path(outputPath))) { + fs.delete(new Path(outputPath), true); + fs.mkdirs(new Path(outputPath)); + } + + new GraphJoiner(spark, ContextMapper.fromIS(isLookupUrl), inputPath, outputPath) + .adjacencyLists(); + } + } + + private static SparkSession getSession(SparkConf conf, String master) { + return SparkSession .builder() .config(conf) .appName(SparkXmlRecordBuilderJob.class.getSimpleName()) - .master(parser.get("master")) + .master(master) .getOrCreate(); - - final String inputPath = parser.get("sourcePath"); - final String outputPath = parser.get("outputPath"); - - final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration()); - if (fs.exists(new Path(outputPath))) { - fs.delete(new Path(outputPath), true); - fs.mkdirs(new Path(outputPath)); - } - - new GraphJoiner(spark, inputPath, outputPath) - .adjacencyLists(); } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/Tuple.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/Tuple.java deleted file mode 100644 index 1eb0491a7..000000000 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/Tuple.java +++ /dev/null @@ -1,29 +0,0 @@ -package eu.dnetlib.dhp.graph; - -import java.io.Serializable; - -public class Tuple implements Serializable { - - private TypedRow relation; - - private TypedRow target; - - - public TypedRow getRelation() { - return relation; - } - - public Tuple setRelation(TypedRow relation) { - this.relation = relation; - return this; - } - - public TypedRow getTarget() { - return target; - } - - public Tuple setTarget(TypedRow target) { - this.target = target; - return this; - } -} 
diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/TupleWrapper.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/TupleWrapper.java deleted file mode 100644 index eb60e1474..000000000 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/TupleWrapper.java +++ /dev/null @@ -1,29 +0,0 @@ -package eu.dnetlib.dhp.graph; - -import java.io.Serializable; - -public class TupleWrapper implements Serializable { - - private TypedRow relation; - - private TypedRow target; - - - public TypedRow getRelation() { - return relation; - } - - public TupleWrapper setRelation(TypedRow relation) { - this.relation = relation; - return this; - } - - public TypedRow getTarget() { - return target; - } - - public TupleWrapper setTarget(TypedRow target) { - this.target = target; - return this; - } -} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/EntityRelEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/EntityRelEntity.java similarity index 96% rename from dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/EntityRelEntity.java rename to dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/EntityRelEntity.java index 285cacbc0..8c08337e2 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/EntityRelEntity.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/EntityRelEntity.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dhp.graph; +package eu.dnetlib.dhp.graph.model; import java.io.Serializable; @@ -15,7 +15,6 @@ public class EntityRelEntity implements Serializable { this.source = source; } - //helpers public Boolean hasMainEntity() { return getSource() != null & getRelation() == null & getTarget() == null; diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/JoinedEntity.java 
b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/JoinedEntity.java similarity index 65% rename from dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/JoinedEntity.java rename to dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/JoinedEntity.java index d65eb64c8..f89273a0d 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/JoinedEntity.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/JoinedEntity.java @@ -1,11 +1,8 @@ -package eu.dnetlib.dhp.graph; +package eu.dnetlib.dhp.graph.model; import eu.dnetlib.dhp.schema.oaf.OafEntity; -import eu.dnetlib.dhp.schema.oaf.Relation; -import scala.Tuple2; import java.io.Serializable; -import java.util.List; public class JoinedEntity implements Serializable { @@ -13,7 +10,7 @@ public class JoinedEntity implements Serializable { private OafEntity entity; - private List> links; + private Links links; public String getType() { return type; @@ -33,11 +30,11 @@ public class JoinedEntity implements Serializable { return this; } - public List> getLinks() { + public Links getLinks() { return links; } - public JoinedEntity setLinks(List> links) { + public JoinedEntity setLinks(Links links) { this.links = links; return this; } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/Links.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/Links.java new file mode 100644 index 000000000..96ad67b0c --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/Links.java @@ -0,0 +1,6 @@ +package eu.dnetlib.dhp.graph.model; + +import java.util.ArrayList; + +public class Links extends ArrayList { +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/RelatedEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/RelatedEntity.java similarity index 94% 
rename from dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/RelatedEntity.java rename to dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/RelatedEntity.java index 50b97dace..baeff1c6a 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/RelatedEntity.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/RelatedEntity.java @@ -1,5 +1,6 @@ -package eu.dnetlib.dhp.graph; +package eu.dnetlib.dhp.graph.model; +import eu.dnetlib.dhp.schema.oaf.Instance; import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; @@ -25,6 +26,7 @@ public class RelatedEntity implements Serializable { private String codeRepositoryUrl; private Qualifier resulttype; private List collectedfrom; + private List instances; // datasource private String officialname; @@ -45,14 +47,6 @@ public class RelatedEntity implements Serializable { private Qualifier contracttype; private List fundingtree; - public static RelatedEntity parse(final String json) { - try { - return new ObjectMapper().readValue(json, RelatedEntity.class); - } catch (IOException e) { - throw new IllegalArgumentException("invalid RelatedEntity, cannot parse: " + json); - } - } - public String getId() { return id; } @@ -125,6 +119,15 @@ public class RelatedEntity implements Serializable { return this; } + public List getInstances() { + return instances; + } + + public RelatedEntity setInstances(List instances) { + this.instances = instances; + return this; + } + public String getOfficialname() { return officialname; } @@ -250,4 +253,5 @@ public class RelatedEntity implements Serializable { this.type = type; return this; } + } \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/Tuple2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/Tuple2.java new file 
mode 100644 index 000000000..ab965808b --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/Tuple2.java @@ -0,0 +1,28 @@ +package eu.dnetlib.dhp.graph.model; + +import eu.dnetlib.dhp.schema.oaf.Relation; + +public class Tuple2 { + + private Relation relation; + + private RelatedEntity relatedEntity; + + public Relation getRelation() { + return relation; + } + + public Tuple2 setRelation(Relation relation) { + this.relation = relation; + return this; + } + + public RelatedEntity getRelatedEntity() { + return relatedEntity; + } + + public Tuple2 setRelatedEntity(RelatedEntity relatedEntity) { + this.relatedEntity = relatedEntity; + return this; + } +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/TypedRow.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/TypedRow.java similarity index 96% rename from dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/TypedRow.java rename to dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/TypedRow.java index 1acbbce93..3651e28c9 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/TypedRow.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/TypedRow.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dhp.graph; +package eu.dnetlib.dhp.graph.model; import java.io.Serializable; diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/ContextDef.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/ContextDef.java new file mode 100644 index 000000000..05d9456f6 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/ContextDef.java @@ -0,0 +1,51 @@ +package eu.dnetlib.dhp.graph.utils; + +import java.io.Serializable; + +public class ContextDef implements Serializable { + + private String id; + private String label; + private String name; + 
private String type; + + public ContextDef(final String id, final String label, final String name, final String type) { + super(); + this.setId(id); + this.setLabel(label); + this.setName(name); + this.setType(type); + } + + public String getLabel() { + return label; + } + + public void setLabel(final String label) { + this.label = label; + } + + public String getId() { + return id; + } + + public void setId(final String id) { + this.id = id; + } + + public String getName() { + return name; + } + + public void setName(final String name) { + this.name = name; + } + + public String getType() { + return type; + } + + public void setType(final String type) { + this.type = type; + } +} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/ContextMapper.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/ContextMapper.java new file mode 100644 index 000000000..0c3a481d0 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/ContextMapper.java @@ -0,0 +1,45 @@ +package eu.dnetlib.dhp.graph.utils; + +import com.google.common.base.Joiner; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import org.dom4j.Document; +import org.dom4j.DocumentException; +import org.dom4j.Node; +import org.dom4j.io.SAXReader; + +import java.io.Serializable; +import java.io.StringReader; +import java.util.HashMap; + +public class ContextMapper extends HashMap implements Serializable { + + private static final long serialVersionUID = 2159682308502487305L; + + private final static String XQUERY = "for $x in //RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ContextDSResourceType']//*[name()='context' or name()='category' or name()='concept'] return "; + + public static ContextMapper fromIS(final String isLookupUrl) throws DocumentException, ISLookUpException { + ISLookUpService isLookUp = 
ISLookupClientFactory.getLookUpService(isLookupUrl); + StringBuilder sb = new StringBuilder(""); + Joiner.on("").appendTo(sb, isLookUp.quickSearchProfile(XQUERY)); + sb.append(""); + return fromXml(sb.toString()); + } + + public static ContextMapper fromXml(final String xml) throws DocumentException { + final ContextMapper contextMapper = new ContextMapper(); + + final Document doc = new SAXReader().read(new StringReader(xml)); + for (Object o : doc.selectNodes("//entry")) { + Node node = (Node) o; + String id = node.valueOf("./@id"); + String label = node.valueOf("./@label"); + String name = node.valueOf("./@name"); + String type = node.valueOf("./@type") + ""; + + contextMapper.put(id, new ContextDef(id, label, name, type)); + } + return contextMapper; + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/GraphMappingUtils.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/GraphMappingUtils.java new file mode 100644 index 000000000..0921fe105 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/GraphMappingUtils.java @@ -0,0 +1,254 @@ +package eu.dnetlib.dhp.graph.utils; + +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.base.Predicate; +import com.google.common.collect.BiMap; +import com.google.common.collect.HashBiMap; +import com.google.common.collect.Maps; +import com.google.common.collect.Sets; +import com.jayway.jsonpath.DocumentContext; +import com.jayway.jsonpath.JsonPath; +import eu.dnetlib.dhp.graph.model.EntityRelEntity; +import eu.dnetlib.dhp.graph.model.RelatedEntity; +import eu.dnetlib.dhp.graph.model.TypedRow; +import eu.dnetlib.dhp.schema.oaf.*; +import net.minidev.json.JSONArray; +import org.apache.commons.lang3.StringUtils; + +import java.util.LinkedHashMap; +import java.util.Map; +import 
java.util.Set; +import java.util.stream.Collectors; + +import static org.apache.commons.lang3.StringUtils.*; + +public class GraphMappingUtils { + + public enum EntityType { + publication, dataset, otherresearchproduct, software, datasource, organization, project + } + + public enum MainEntityType { + result, datasource, organization, project + } + + public static Set authorPidTypes = Sets.newHashSet("orcid", "magidentifier"); + + public static Set instanceFieldFilter = Sets.newHashSet("instancetype", "hostedby", "license", "accessright", "collectedfrom", "dateofacceptance", "distributionlocation"); + + private static BiMap relClassMapping = HashBiMap.create(); + + static { + relClassMapping.put("isAuthorInstitutionOf", "hasAuthorInstitution"); + relClassMapping.put("isMergedIn", "merges"); + relClassMapping.put("isProducedBy", "produces"); + relClassMapping.put("hasParticipant", "isParticipant"); + relClassMapping.put("isProvidedBy", "provides"); + relClassMapping.put("isRelatedTo", "isRelatedTo"); + relClassMapping.put("isAmongTopNSimilarDocuments", "hasAmongTopNSimilarDocuments"); + relClassMapping.put("isRelatedTo", "isRelatedTo"); + relClassMapping.put("isSupplementTo", "isSupplementedBy"); + } + + public static String getInverseRelClass(final String relClass) { + String res = relClassMapping.get(relClass); + if (isNotBlank(res)) { + return res; + } + res = relClassMapping.inverse().get(relClass); + + if (isNotBlank(res)) { + return res; + } + + throw new IllegalArgumentException("unable to find an inverse relationship class for term: " + relClass); + } + + private static final String schemeTemplate = "dnet:%s_%s_relations"; + + private static Map entityMapping = Maps.newHashMap(); + + static { + entityMapping.put(EntityType.publication, MainEntityType.result); + entityMapping.put(EntityType.dataset, MainEntityType.result); + entityMapping.put(EntityType.otherresearchproduct, MainEntityType.result); + entityMapping.put(EntityType.software, 
MainEntityType.result); + entityMapping.put(EntityType.datasource, MainEntityType.datasource); + entityMapping.put(EntityType.organization, MainEntityType.organization); + entityMapping.put(EntityType.project, MainEntityType.project); + } + + public static String getScheme(final String sourceType, final String targetType) { + return String.format(schemeTemplate, + entityMapping.get(EntityType.valueOf(sourceType)).name(), + entityMapping.get(EntityType.valueOf(targetType)).name()); + } + + public static String getMainType(final String type) { + return entityMapping.get(EntityType.valueOf(type)).name(); + } + + public static boolean isResult(String type) { + return MainEntityType.result.name().equals(getMainType(type)); + } + + public static Predicate instanceFilter = s -> instanceFieldFilter.contains(s); + + public static EntityRelEntity asRelatedEntity(EntityRelEntity e) { + + final DocumentContext j = JsonPath.parse(e.getSource().getOaf()); + final RelatedEntity re = new RelatedEntity().setId(j.read("$.id")).setType(e.getSource().getType()); + + switch (EntityType.valueOf(e.getSource().getType())) { + case publication: + case dataset: + case otherresearchproduct: + case software: + mapTitle(j, re); + re.setDateofacceptance(j.read("$.dateofacceptance.value")); + re.setPublisher(j.read("$.publisher.value")); + + JSONArray pids = j.read("$.pid"); + re.setPid(pids.stream() + .map(p -> asStructuredProperty((LinkedHashMap) p)) + .collect(Collectors.toList())); + + re.setResulttype(asQualifier(j.read("$.resulttype"))); + + JSONArray collfrom = j.read("$.collectedfrom"); + re.setCollectedfrom(collfrom.stream() + .map(c -> asKV((LinkedHashMap) c)) + .collect(Collectors.toList())); + + // will throw exception when the instance is not found + JSONArray instances = j.read("$.instance"); + re.setInstances(instances.stream() + .map(i -> { + final LinkedHashMap p = (LinkedHashMap) i; + final Field license = new Field(); + license.setValue((String) ((LinkedHashMap) 
p.get("license")).get("value")); + final Instance instance = new Instance(); + instance.setLicense(license); + instance.setAccessright(asQualifier((LinkedHashMap) p.get("accessright"))); + instance.setInstancetype(asQualifier((LinkedHashMap) p.get("instancetype"))); + instance.setHostedby(asKV((LinkedHashMap) p.get("hostedby"))); + //TODO mapping of distributionlocation + instance.setCollectedfrom(asKV((LinkedHashMap) p.get("collectedfrom"))); + + Field dateofacceptance = new Field(); + dateofacceptance.setValue((String) ((LinkedHashMap) p.get("dateofacceptance")).get("value")); + instance.setDateofacceptance(dateofacceptance); + return instance; + }).collect(Collectors.toList())); + + //TODO still to be mapped + //re.setCodeRepositoryUrl(j.read("$.coderepositoryurl")); + + break; + case datasource: + re.setOfficialname(j.read("$.officialname.value")); + re.setWebsiteurl(j.read("$.websiteurl.value")); + re.setDatasourcetype(asQualifier(j.read("$.datasourcetype"))); + re.setOpenairecompatibility(asQualifier(j.read("$.openairecompatibility"))); + + break; + case organization: + re.setLegalname(j.read("$.legalname.value")); + re.setLegalshortname(j.read("$.legalshortname.value")); + re.setCountry(asQualifier(j.read("$.country"))); + + break; + case project: + re.setProjectTitle(j.read("$.title.value")); + re.setCode(j.read("$.code.value")); + re.setAcronym(j.read("$.acronym.value")); + re.setContracttype(asQualifier(j.read("$.contracttype"))); + + JSONArray f = j.read("$.fundingtree"); + if (!f.isEmpty()) { + re.setFundingtree(f.stream() + .map(s -> ((LinkedHashMap) s).get("value")) + .collect(Collectors.toList())); + } + + break; + } + return new EntityRelEntity().setSource( + new TypedRow() + .setSourceId(e.getSource().getSourceId()) + .setDeleted(e.getSource().getDeleted()) + .setType(e.getSource().getType()) + .setOaf(serialize(re))); + } + + private static KeyValue asKV(LinkedHashMap j) { + final KeyValue kv = new KeyValue(); + kv.setKey((String) j.get("key")); + 
kv.setValue((String) j.get("value")); + return kv; + } + + private static void mapTitle(DocumentContext j, RelatedEntity re) { + final JSONArray a = j.read("$.title"); + if (!a.isEmpty()) { + final StructuredProperty sp = asStructuredProperty((LinkedHashMap) a.get(0)); + if (StringUtils.isNotBlank(sp.getValue())) { + re.setTitle(sp); + } + } + } + + private static StructuredProperty asStructuredProperty(LinkedHashMap j) { + final StructuredProperty sp = new StructuredProperty(); + final String value = (String) j.get("value"); + if (StringUtils.isNotBlank(value)) { + sp.setValue((String) j.get("value")); + sp.setQualifier(asQualifier((LinkedHashMap) j.get("qualifier"))); + } + return sp; + } + + public static Qualifier asQualifier(LinkedHashMap j) { + final Qualifier q = new Qualifier(); + + final String classid = j.get("classid"); + if (StringUtils.isNotBlank(classid)) { + q.setClassid(classid); + } + + final String classname = j.get("classname"); + if (StringUtils.isNotBlank(classname)) { + q.setClassname(classname); + } + + final String schemeid = j.get("schemeid"); + if (StringUtils.isNotBlank(schemeid)) { + q.setSchemeid(schemeid); + } + + final String schemename = j.get("schemename"); + if (StringUtils.isNotBlank(schemename)) { + q.setSchemename(schemename); + } + return q; + } + + public static String serialize(final Object o) { + try { + return new ObjectMapper() + .setSerializationInclusion(JsonInclude.Include.NON_NULL) + .writeValueAsString(o); + } catch (JsonProcessingException e) { + throw new IllegalArgumentException("unable to serialize: " + o.toString(), e); + } + } + + public static String removePrefix(final String s) { + if (s.contains("|")) return substringAfter(s, "|"); + return s; + } + + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/ISLookupClientFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/ISLookupClientFactory.java new file mode 100644 index 
000000000..d87f29452 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/ISLookupClientFactory.java @@ -0,0 +1,24 @@ +package eu.dnetlib.dhp.graph.utils; + +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.cxf.jaxws.JaxWsProxyFactoryBean; + +public class ISLookupClientFactory { + + private static final Log log = LogFactory.getLog(ISLookupClientFactory.class); + + public static ISLookUpService getLookUpService(final String isLookupUrl) { + return getServiceStub(ISLookUpService.class, isLookupUrl); + } + + @SuppressWarnings("unchecked") + private static T getServiceStub(final Class clazz, final String endpoint) { + log.info(String.format("creating %s stub from %s", clazz.getName(), endpoint)); + final JaxWsProxyFactoryBean jaxWsProxyFactory = new JaxWsProxyFactoryBean(); + jaxWsProxyFactory.setServiceClass(clazz); + jaxWsProxyFactory.setAddress(endpoint); + return (T) jaxWsProxyFactory.create(); + } +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/LicenseComparator.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/LicenseComparator.java new file mode 100644 index 000000000..c4cbfadea --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/LicenseComparator.java @@ -0,0 +1,49 @@ +package eu.dnetlib.dhp.graph.utils; + +import eu.dnetlib.dhp.schema.oaf.Qualifier; + +import java.util.Comparator; + +public class LicenseComparator implements Comparator { + + @Override + public int compare(Qualifier left, Qualifier right) { + + if (left == null && right == null) return 0; + if (left == null) return 1; + if (right == null) return -1; + + String lClass = left.getClassid(); + String rClass = right.getClassid(); + + if (lClass.equals(rClass)) return 0; + + if (lClass.equals("OPEN SOURCE")) return -1; + if 
(rClass.equals("OPEN SOURCE")) return 1; + + if (lClass.equals("OPEN")) return -1; + if (rClass.equals("OPEN")) return 1; + + if (lClass.equals("6MONTHS")) return -1; + if (rClass.equals("6MONTHS")) return 1; + + if (lClass.equals("12MONTHS")) return -1; + if (rClass.equals("12MONTHS")) return 1; + + if (lClass.equals("EMBARGO")) return -1; + if (rClass.equals("EMBARGO")) return 1; + + if (lClass.equals("RESTRICTED")) return -1; + if (rClass.equals("RESTRICTED")) return 1; + + if (lClass.equals("CLOSED")) return -1; + if (rClass.equals("CLOSED")) return 1; + + if (lClass.equals("UNKNOWN")) return -1; + if (rClass.equals("UNKNOWN")) return 1; + + // Else (but unlikely), lexicographical ordering will do. + return lClass.compareTo(rClass); + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/StreamingInputDocumentFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/StreamingInputDocumentFactory.java new file mode 100644 index 000000000..736c9fc28 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/StreamingInputDocumentFactory.java @@ -0,0 +1,253 @@ +package eu.dnetlib.dhp.graph.utils; + +import java.io.StringReader; +import java.io.StringWriter; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import javax.xml.stream.*; +import javax.xml.stream.events.Namespace; +import javax.xml.stream.events.StartElement; +import javax.xml.stream.events.XMLEvent; + +import com.google.common.collect.Lists; +import org.apache.solr.common.SolrInputDocument; + +/** + * Optimized version of the document parser, drop in replacement of InputDocumentFactory. + * + *

+ * Faster because: + *

+ *
    + *
  • Doesn't create a DOM for the full document
  • + *
  • Doesn't execute XPaths against the DOM
  • + *
  • Quickly serializes the 'result' element directly into a string.
  • + *
  • Uses less memory: less pressure on the GC, and more threads can process documents in parallel
  • + *
+ * + *

+ * This class is fully reentrant and can be invoked in parallel. + *

+ * + * @author claudio + * + */ +public class StreamingInputDocumentFactory { + + private static final String INDEX_FIELD_PREFIX = "__"; + + private static final String DS_VERSION = INDEX_FIELD_PREFIX + "dsversion"; + + private static final String DS_ID = INDEX_FIELD_PREFIX + "dsid"; + + private static final String RESULT = "result"; + + private static final String INDEX_RESULT = INDEX_FIELD_PREFIX + RESULT; + + private static final String INDEX_RECORD_ID = INDEX_FIELD_PREFIX + "indexrecordidentifier"; + + private static final String outFormat = new String("yyyy-MM-dd'T'hh:mm:ss'Z'"); + + private final static List dateFormats = Arrays.asList("yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "dd-MM-yyyy", "dd/MM/yyyy", "yyyy"); + + private static final String DEFAULTDNETRESULT = "dnetResult"; + + private static final String TARGETFIELDS = "targetFields"; + + private static final String INDEX_RECORD_ID_ELEMENT = "indexRecordIdentifier"; + + private static final String ROOT_ELEMENT = "indexRecord"; + + private static final int MAX_FIELD_LENGTH = 25000; + + private ThreadLocal inputFactory = ThreadLocal.withInitial(() -> XMLInputFactory.newInstance()); + + private ThreadLocal outputFactory = ThreadLocal.withInitial(() -> XMLOutputFactory.newInstance()); + + private ThreadLocal eventFactory = ThreadLocal.withInitial(() -> XMLEventFactory.newInstance()); + + private String version; + + private String dsId; + + private String resultName = DEFAULTDNETRESULT; + + public StreamingInputDocumentFactory(final String version, final String dsId) { + this(version, dsId, DEFAULTDNETRESULT); + } + + public StreamingInputDocumentFactory(final String version, final String dsId, final String resultName) { + this.version = version; + this.dsId = dsId; + this.resultName = resultName; + } + + public SolrInputDocument parseDocument(final String inputDocument) { + + final StringWriter results = new StringWriter(); + final List nsList = Lists.newLinkedList(); + try { + + XMLEventReader parser = 
inputFactory.get().createXMLEventReader(new StringReader(inputDocument)); + + final SolrInputDocument indexDocument = new SolrInputDocument(new HashMap<>()); + + while (parser.hasNext()) { + final XMLEvent event = parser.nextEvent(); + if ((event != null) && event.isStartElement()) { + final String localName = event.asStartElement().getName().getLocalPart(); + + if (ROOT_ELEMENT.equals(localName)) { + nsList.addAll(getNamespaces(event)); + } else if (INDEX_RECORD_ID_ELEMENT.equals(localName)) { + final XMLEvent text = parser.nextEvent(); + String recordId = getText(text); + indexDocument.addField(INDEX_RECORD_ID, recordId); + } else if (TARGETFIELDS.equals(localName)) { + parseTargetFields(indexDocument, parser); + } else if (resultName.equals(localName)) { + copyResult(indexDocument, results, parser, nsList, resultName); + } + } + } + + if (version != null) { + indexDocument.addField(DS_VERSION, version); + } + + if (dsId != null) { + indexDocument.addField(DS_ID, dsId); + } + + if (!indexDocument.containsKey(INDEX_RECORD_ID)) { + indexDocument.clear(); + System.err.println("missing indexrecord id:\n" + inputDocument); + } + + return indexDocument; + } catch (XMLStreamException e) { + return new SolrInputDocument(); + } + } + + private List getNamespaces(final XMLEvent event) { + final List res = Lists.newLinkedList(); + @SuppressWarnings("unchecked") + Iterator nsIter = event.asStartElement().getNamespaces(); + while (nsIter.hasNext()) { + Namespace ns = nsIter.next(); + res.add(ns); + } + return res; + } + + /** + * Parse the targetFields block and add fields to the solr document. 
+ * + * @param indexDocument + * @param parser + * @throws XMLStreamException + */ + protected void parseTargetFields(final SolrInputDocument indexDocument, final XMLEventReader parser) throws XMLStreamException { + + boolean hasFields = false; + + while (parser.hasNext()) { + final XMLEvent targetEvent = parser.nextEvent(); + if (targetEvent.isEndElement() && targetEvent.asEndElement().getName().getLocalPart().equals(TARGETFIELDS)) { + break; + } + + if (targetEvent.isStartElement()) { + final String fieldName = targetEvent.asStartElement().getName().getLocalPart(); + final XMLEvent text = parser.nextEvent(); + + String data = getText(text); + + addField(indexDocument, fieldName, data); + hasFields = true; + } + } + + if (!hasFields) { + indexDocument.clear(); + } + } + + /** + * Copy the /indexRecord/result element and children, preserving namespace declarations etc. + * + * @param indexDocument + * @param results + * @param parser + * @param nsList + * @throws XMLStreamException + */ + protected void copyResult(final SolrInputDocument indexDocument, + final StringWriter results, + final XMLEventReader parser, + final List nsList, + final String dnetResult) throws XMLStreamException { + final XMLEventWriter writer = outputFactory.get().createXMLEventWriter(results); + + for (Namespace ns : nsList) { + eventFactory.get().createNamespace(ns.getPrefix(), ns.getNamespaceURI()); + } + + StartElement newRecord = eventFactory.get().createStartElement("", null, RESULT, null, nsList.iterator()); + + // new root record + writer.add(newRecord); + + // copy the rest as it is + while (parser.hasNext()) { + final XMLEvent resultEvent = parser.nextEvent(); + + // TODO: replace with depth tracking instead of close tag tracking. 
+ if (resultEvent.isEndElement() && resultEvent.asEndElement().getName().getLocalPart().equals(dnetResult)) { + writer.add(eventFactory.get().createEndElement("", null, RESULT)); + break; + } + + writer.add(resultEvent); + } + writer.close(); + indexDocument.addField(INDEX_RESULT, results.toString()); + } + + /** + * Helper used to add a field to a solr doc. It avoids to add empy fields + * + * @param indexDocument + * @param field + * @param value + */ + private final void addField(final SolrInputDocument indexDocument, final String field, final String value) { + String cleaned = value.trim(); + if (!cleaned.isEmpty()) { + // log.info("\n\n adding field " + field.toLowerCase() + " value: " + cleaned + "\n"); + indexDocument.addField(field.toLowerCase(), cleaned); + } + } + + /** + * Helper used to get the string from a text element. + * + * @param text + * @return the + */ + protected final String getText(final XMLEvent text) { + if (text.isEndElement()) // log.warn("skipping because isEndOfElement " + text.asEndElement().getName().getLocalPart()); + return ""; + + final String data = text.asCharacters().getData(); + if (data != null && data.length() > MAX_FIELD_LENGTH) { + return data.substring(0, MAX_FIELD_LENGTH); + } + + return data; + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/TemplateFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/TemplateFactory.java new file mode 100644 index 000000000..27c55fab7 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/TemplateFactory.java @@ -0,0 +1,107 @@ +package eu.dnetlib.dhp.graph.utils; + +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.OafEntity; +import org.apache.commons.lang3.StringUtils; +import org.stringtemplate.v4.ST; + +import java.io.IOException; +import java.util.Collection; +import java.util.List; +import java.util.stream.Collectors; + +import 
static eu.dnetlib.dhp.graph.utils.GraphMappingUtils.removePrefix; +import static eu.dnetlib.dhp.graph.utils.XmlSerializationUtils.escapeXml; + +public class TemplateFactory { + + private TemplateResources resources; + + private final static char DELIMITER = '$'; + + public TemplateFactory() { + try { + resources = new TemplateResources(); + } catch (IOException e) { + throw new IllegalStateException(e); + } + } + + public String buildBody(final String type, final List metadata, final List rels, final List children, final List extraInfo) { + ST body = getTemplate(resources.getEntity()); + + body.add("name", type); + body.add("metadata", metadata); + body.add("rels", rels); + body.add("children", children); + body.add("extrainfo", extraInfo); + + return body.render(); + } + + public String getChild(final String name, final String id, final List metadata) { + return getTemplate(resources.getChild()) + .add("name", name) + .add("hasId", !(id == null)) + .add("id", id != null ? escapeXml(removePrefix(id)) : "") + .add("metadata", metadata) + .render(); + } + + public String buildRecord( + final OafEntity entity, + final String schemaLocation, + final String body) { + return getTemplate(resources.getRecord()) + .add("id", escapeXml(removePrefix(entity.getId()))) + .add("dateofcollection", entity.getDateofcollection()) + .add("dateoftransformation", entity.getDateoftransformation()) + .add("schemaLocation", schemaLocation) + .add("it", body) + .render(); + } + + public String getRel(final String type, + final String objIdentifier, + final Collection fields, + final String semanticclass, + final String semantischeme, + final DataInfo info) { + return getTemplate(resources.getRel()) + .add("type", type) + .add("objIdentifier", escapeXml(removePrefix(objIdentifier))) + .add("class", semanticclass) + .add("scheme", semantischeme) + .add("metadata", fields) + .add("inferred", info.getInferred()) + .add("trust", info.getTrust()) + .add("inferenceprovenance", 
info.getInferenceprovenance()) + .add("provenanceaction", info.getProvenanceaction() != null ? info.getProvenanceaction().getClassid() : "") + .render(); + } + + public String getInstance(final String resultId, final List instancemetadata, final List webresources) { + return getTemplate(resources.getInstance()) + .add("instanceId", escapeXml(removePrefix(resultId))) + .add("metadata", instancemetadata) + .add("webresources", webresources + .stream() + .filter(StringUtils::isNotBlank) + .map(w -> getWebResource(w)) + .collect(Collectors.toList())) + .render(); + } + + private String getWebResource(final String identifier) { + return getTemplate(resources.getWebresource()) + .add("identifier", escapeXml(identifier)) + .render(); + } + + // HELPERS + + private ST getTemplate(final String res) { + return new ST(res, DELIMITER, DELIMITER); + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/TemplateResources.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/TemplateResources.java new file mode 100644 index 000000000..92aaedfd3 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/TemplateResources.java @@ -0,0 +1,54 @@ +package eu.dnetlib.dhp.graph.utils; + +import com.google.common.io.Resources; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; + +public class TemplateResources { + + private String record = read("eu/dnetlib/dhp/graph/template/record.st"); + + private String instance = read("eu/dnetlib/dhp/graph/template/instance.st"); + + private String rel = read("eu/dnetlib/dhp/graph/template/rel.st"); + + private String webresource = read("eu/dnetlib/dhp/graph/template/webresource.st"); + + private String child = read("eu/dnetlib/dhp/graph/template/child.st"); + + private String entity = read("eu/dnetlib/dhp/graph/template/entity.st"); + + private static String read(final String classpathResource) throws IOException { + return 
Resources.toString(Resources.getResource(classpathResource), StandardCharsets.UTF_8); + } + + public TemplateResources() throws IOException { + + } + + public String getEntity() { + return entity; + } + + public String getRecord() { + return record; + } + + public String getInstance() { + return instance; + } + + public String getRel() { + return rel; + } + + public String getWebresource() { + return webresource; + } + + public String getChild() { + return child; + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/XmlRecordFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/XmlRecordFactory.java new file mode 100644 index 000000000..bd4f8ec6c --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/XmlRecordFactory.java @@ -0,0 +1,962 @@ +package eu.dnetlib.dhp.graph.utils; + +import com.google.common.base.Joiner; +import com.google.common.base.Splitter; +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; +import com.mycila.xmltool.XMLDoc; +import com.mycila.xmltool.XMLTag; +import eu.dnetlib.dhp.graph.model.JoinedEntity; +import eu.dnetlib.dhp.graph.model.RelatedEntity; +import eu.dnetlib.dhp.graph.model.Tuple2; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.*; +import org.dom4j.Document; +import org.dom4j.DocumentException; +import org.dom4j.Element; +import org.dom4j.Node; +import org.dom4j.io.OutputFormat; +import org.dom4j.io.SAXReader; +import org.dom4j.io.XMLWriter; + +import javax.xml.transform.*; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; +import java.io.IOException; +import java.io.Serializable; +import java.io.StringReader; +import java.io.StringWriter; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +import static eu.dnetlib.dhp.graph.utils.GraphMappingUtils.*; +import static 
eu.dnetlib.dhp.graph.utils.XmlSerializationUtils.*; +import static org.apache.commons.lang3.StringUtils.isNotBlank; +import static org.apache.commons.lang3.StringUtils.substringBefore; + +public class XmlRecordFactory implements Serializable { + + private Set specialDatasourceTypes; + + private ContextMapper contextMapper; + + private String schemaLocation; + + private Set contextes = Sets.newHashSet(); + + private boolean indent = false; + + public XmlRecordFactory( + final ContextMapper contextMapper, final boolean indent, + final String schemaLocation, final Set otherDatasourceTypesUForUI) { + + this.contextMapper = contextMapper; + this.schemaLocation = schemaLocation; + this.specialDatasourceTypes = otherDatasourceTypesUForUI; + + this.indent = indent; + } + + public String build(final JoinedEntity je) { + final OafEntity entity = je.getEntity(); + TemplateFactory templateFactory = new TemplateFactory(); + try { + final List metadata = metadata(je.getType(), entity); + + // rels has to be processed before the contexts because they enrich the contextMap with the funding info. + final List relations = listRelations(je, templateFactory); + + metadata.addAll(buildContexts(getMainType(je.getType()))); + metadata.add(parseDataInfo(entity.getDataInfo())); + + final String body = templateFactory.buildBody( + getMainType(je.getType()), + metadata, + relations, + listChildren(je, templateFactory), listExtraInfo(je)); + + return printXML(templateFactory.buildRecord(entity, schemaLocation, body), indent); + } catch (final Throwable e) { + throw new RuntimeException(String.format("error building record '%s'", entity.getId()), e); + } + } + + private String printXML(String xml, boolean indent) { + try { + final Document doc = new SAXReader().read(new StringReader(xml)); + OutputFormat format = indent ? 
OutputFormat.createPrettyPrint() : OutputFormat.createCompactFormat(); + format.setExpandEmptyElements(false); + format.setSuppressDeclaration(true); + StringWriter sw = new StringWriter(); + XMLWriter writer = new XMLWriter(sw, format); + writer.write(doc); + return sw.toString(); + } catch (IOException | DocumentException e) { + throw new IllegalArgumentException("Unable to indent XML. Invalid record:\n" + xml, e); + } + } + + private List metadata(final String type, final OafEntity entity) { + + final List metadata = Lists.newArrayList(); + + if (entity.getCollectedfrom() != null) { + metadata.addAll(entity.getCollectedfrom() + .stream() + .map(kv -> mapKeyValue("collectedfrom", kv)) + .collect(Collectors.toList())); + } + if (entity.getOriginalId() != null) { + metadata.addAll(entity.getOriginalId() + .stream() + .map(s -> asXmlElement("originalId", s)) + .collect(Collectors.toList())); + } + if (entity.getPid() != null) { + metadata.addAll(entity.getPid() + .stream() + .map(p -> mapStructuredProperty("pid", p)) + .collect(Collectors.toList())); + } + + if (GraphMappingUtils.isResult(type)) { + final Result r = (Result) entity; + + if (r.getTitle() != null) { + metadata.addAll(r.getTitle() + .stream() + .map(t -> mapStructuredProperty("title", t)) + .collect(Collectors.toList())); + } + if (r.getAuthor() != null) { + metadata.addAll(r.getAuthor() + .stream() + .map(a -> { + final StringBuilder sb = new StringBuilder(" isNotBlank(sp.getQualifier().getClassid()) && isNotBlank(sp.getValue())) + .forEach(sp -> { + String pidType = escapeXml(sp.getQualifier().getClassid()).replaceAll("\\W", ""); + String pidValue = escapeXml(sp.getValue()); + + // ugly hack: some records provide swapped pidtype and pidvalue + if (authorPidTypes.contains(pidValue.toLowerCase().trim())) { + sb.append(String.format(" %s=\"%s\"", pidValue, pidType)); + } else { + pidType = pidType.replaceAll("\\W", "").replaceAll("\\d", ""); + if (isNotBlank(pidType)) { + sb.append(String.format(" 
%s=\"%s\"", + pidType, + pidValue.toLowerCase().replaceAll("orcid", ""))); + } + } + }); + } + sb.append(">" + escapeXml(a.getFullname()) + ""); + return sb.toString(); + }).collect(Collectors.toList())); + } + if (r.getContributor() != null) { + metadata.addAll(r.getContributor() + .stream() + .map(c -> asXmlElement("contributor", c.getValue())) + .collect(Collectors.toList())); + } + if (r.getCountry() != null) { + metadata.addAll(r.getCountry() + .stream() + .map(c -> mapQualifier("country", c)) + .collect(Collectors.toList())); + } + if (r.getCoverage() != null) { + metadata.addAll(r.getCoverage() + .stream() + .map(c -> asXmlElement("coverage", c.getValue())) + .collect(Collectors.toList())); + } + if (r.getDateofacceptance() != null) { + metadata.add(asXmlElement("dateofacceptance", r.getDateofacceptance().getValue())); + } + if (r.getDescription() != null) { + metadata.addAll(r.getDescription() + .stream() + .map(c -> asXmlElement("description", c.getValue())) + .collect(Collectors.toList())); + } + if (r.getEmbargoenddate() != null) { + metadata.add(asXmlElement("embargoenddate", r.getEmbargoenddate().getValue())); + } + if (r.getSubject() != null) { + metadata.addAll(r.getSubject() + .stream() + .map(s -> mapStructuredProperty("subject", s)) + .collect(Collectors.toList())); + } + if (r.getLanguage() != null) { + metadata.add(mapQualifier("language", r.getLanguage())); + } + if (r.getRelevantdate() != null) { + metadata.addAll(r.getRelevantdate() + .stream() + .map(s -> mapStructuredProperty("relevantdate", s)) + .collect(Collectors.toList())); + } + if (r.getPublisher() != null) { + metadata.add(asXmlElement("publisher", r.getPublisher().getValue())); + } + if (r.getSource() != null) { + metadata.addAll(r.getSource() + .stream() + .map(c -> asXmlElement("source", c.getValue())) + .collect(Collectors.toList())); + } + if (r.getFormat() != null) { + metadata.addAll(r.getFormat() + .stream() + .map(c -> asXmlElement("format", c.getValue())) + 
.collect(Collectors.toList())); + } + if (r.getResulttype() != null) { + metadata.add(mapQualifier("resulttype", r.getResulttype())); + } + if (r.getResourcetype() != null) { + metadata.add(mapQualifier("resourcetype", r.getResourcetype())); + } + if (r.getRefereed() != null) { + metadata.add(asXmlElement("refereed", r.getRefereed().getValue())); + } + if (r.getProcessingchargeamount() != null) { + metadata.add(asXmlElement("processingchargeamount", r.getProcessingchargeamount().getValue())); + } + if (r.getProcessingchargecurrency() != null) { + metadata.add(asXmlElement("processingchargecurrency", r.getProcessingchargecurrency().getValue())); + } + + metadata.add(mapQualifier("bestaccessright", getBestAccessright(r))); + + if (r.getContext() != null) { + contextes.addAll(r.getContext() + .stream() + .map(c -> c.getId()) + .collect(Collectors.toList())); + if (contextes.contains("dh-ch::subcommunity::2")) { + contextes.add("clarin"); + } + } + } + + switch (EntityType.valueOf(type)) { + case publication: + final Publication pub = (Publication) entity; + + if (pub.getJournal() != null) { + final Journal j = pub.getJournal(); + metadata.add(mapJournal(j)); + } + + break; + case dataset: + final Dataset d = (Dataset) entity; + if (d.getDevice() != null) { + metadata.add(asXmlElement("device", d.getDevice().getValue())); + } + if (d.getLastmetadataupdate() != null) { + metadata.add(asXmlElement("lastmetadataupdate", d.getLastmetadataupdate().getValue())); + } + if (d.getMetadataversionnumber() != null) { + metadata.add(asXmlElement("metadataversionnumber", d.getMetadataversionnumber().getValue())); + } + if (d.getSize() != null) { + metadata.add(asXmlElement("size", d.getSize().getValue())); + } + if (d.getStoragedate() != null) { + metadata.add(asXmlElement("storagedate", d.getStoragedate().getValue())); + } + if (d.getVersion() != null) { + metadata.add(asXmlElement("version", d.getVersion().getValue())); + } + //TODO d.getGeolocation() + + break; + case 
otherresearchproduct: + final OtherResearchProduct orp = (OtherResearchProduct) entity; + + if (orp.getContactperson() != null) { + metadata.addAll(orp.getContactperson() + .stream() + .map(c -> asXmlElement("contactperson", c.getValue())) + .collect(Collectors.toList())); + } + + if (orp.getContactgroup() != null) { + metadata.addAll(orp.getContactgroup() + .stream() + .map(c -> asXmlElement("contactgroup", c.getValue())) + .collect(Collectors.toList())); + } + if (orp.getTool() != null) { + metadata.addAll(orp.getTool() + .stream() + .map(c -> asXmlElement("tool", c.getValue())) + .collect(Collectors.toList())); + } + break; + case software: + final Software s = (Software) entity; + + if (s.getDocumentationUrl() != null) { + metadata.addAll(s.getDocumentationUrl() + .stream() + .map(c -> asXmlElement("documentationUrl", c.getValue())) + .collect(Collectors.toList())); + } + if (s.getLicense() != null) { + metadata.addAll(s.getLicense() + .stream() + .map(l -> mapStructuredProperty("license", l)) + .collect(Collectors.toList())); + } + if (s.getCodeRepositoryUrl() != null) { + metadata.add(asXmlElement("codeRepositoryUrl", s.getCodeRepositoryUrl().getValue())); + } + if (s.getProgrammingLanguage() != null) { + metadata.add(mapQualifier("programmingLanguage", s.getProgrammingLanguage())); + } + break; + case datasource: + final Datasource ds = (Datasource) entity; + + if (ds.getDatasourcetype() != null) { + mapDatasourceType(metadata, ds.getDatasourcetype()); + } + if (ds.getOpenairecompatibility() != null) { + metadata.add(mapQualifier("openairecompatibility", ds.getOpenairecompatibility())); + } + if (ds.getOfficialname() != null) { + metadata.add(asXmlElement("officialname", ds.getOfficialname().getValue())); + } + if (ds.getEnglishname() != null) { + metadata.add(asXmlElement("englishname", ds.getEnglishname().getValue())); + } + if (ds.getWebsiteurl() != null) { + metadata.add(asXmlElement("websiteurl", ds.getWebsiteurl().getValue())); + } + if 
(ds.getLogourl() != null) { + metadata.add(asXmlElement("logourl", ds.getLogourl().getValue())); + } + if (ds.getContactemail() != null) { + metadata.add(asXmlElement("contactemail", ds.getContactemail().getValue())); + } + if (ds.getNamespaceprefix() != null) { + metadata.add(asXmlElement("namespaceprefix", ds.getNamespaceprefix().getValue())); + } + if (ds.getLatitude() != null) { + metadata.add(asXmlElement("latitude", ds.getLatitude().getValue())); + } + if (ds.getLongitude() != null) { + metadata.add(asXmlElement("longitude", ds.getLongitude().getValue())); + } + if (ds.getDateofvalidation() != null) { + metadata.add(asXmlElement("dateofvalidation", ds.getDateofvalidation().getValue())); + } + if (ds.getDescription() != null) { + metadata.add(asXmlElement("description", ds.getDescription().getValue())); + } + if (ds.getOdnumberofitems() != null) { + metadata.add(asXmlElement("odnumberofitems", ds.getOdnumberofitems().getValue())); + } + if (ds.getOdnumberofitemsdate() != null) { + metadata.add(asXmlElement("odnumberofitemsdate", ds.getOdnumberofitemsdate().getValue())); + } + if (ds.getOdpolicies() != null) { + metadata.add(asXmlElement("odpolicies", ds.getOdpolicies().getValue())); + } + if (ds.getOdlanguages() != null) { + metadata.addAll(ds.getOdlanguages() + .stream() + .map(c -> asXmlElement("odlanguages", c.getValue())) + .collect(Collectors.toList())); + } + if (ds.getOdcontenttypes() != null) { + metadata.addAll(ds.getOdcontenttypes() + .stream() + .map(c -> asXmlElement("odcontenttypes", c.getValue())) + .collect(Collectors.toList())); + } + if (ds.getAccessinfopackage() != null) { + metadata.addAll(ds.getAccessinfopackage() + .stream() + .map(c -> asXmlElement("accessinfopackage", c.getValue())) + .collect(Collectors.toList())); + } + if (ds.getReleaseenddate() != null) { + metadata.add(asXmlElement("releasestartdate", ds.getReleaseenddate().getValue())); + } + if (ds.getReleaseenddate() != null) { + metadata.add(asXmlElement("releaseenddate", 
ds.getReleaseenddate().getValue())); + } + if (ds.getMissionstatementurl() != null) { + metadata.add(asXmlElement("missionstatementurl", ds.getMissionstatementurl().getValue())); + } + if (ds.getDataprovider() != null) { + metadata.add(asXmlElement("dataprovider", ds.getDataprovider().getValue().toString())); + } + if (ds.getServiceprovider() != null) { + metadata.add(asXmlElement("serviceprovider", ds.getServiceprovider().getValue().toString())); + } + if (ds.getDatabaseaccesstype() != null) { + metadata.add(asXmlElement("databaseaccesstype", ds.getDatabaseaccesstype().getValue())); + } + if (ds.getDatauploadtype() != null) { + metadata.add(asXmlElement("datauploadtype", ds.getDatauploadtype().getValue())); + } + if (ds.getDatabaseaccessrestriction() != null) { + metadata.add(asXmlElement("databaseaccessrestriction", ds.getDatabaseaccessrestriction().getValue())); + } + if (ds.getDatauploadrestriction() != null) { + metadata.add(asXmlElement("datauploadrestriction", ds.getDatauploadrestriction().getValue())); + } + if (ds.getVersioning() != null) { + metadata.add(asXmlElement("versioning", ds.getVersioning().getValue().toString())); + } + if (ds.getCitationguidelineurl() != null) { + metadata.add(asXmlElement("citationguidelineurl", ds.getCitationguidelineurl().getValue())); + } + if (ds.getQualitymanagementkind() != null) { + metadata.add(asXmlElement("qualitymanagementkind", ds.getQualitymanagementkind().getValue())); + } + if (ds.getPidsystems() != null) { + metadata.add(asXmlElement("pidsystems", ds.getPidsystems().getValue())); + } + if (ds.getCertificates() != null) { + metadata.add(asXmlElement("certificates", ds.getCertificates().getValue())); + } + if (ds.getPolicies() != null) { + metadata.addAll(ds.getPolicies() + .stream() + .map(kv -> mapKeyValue("policies", kv)) + .collect(Collectors.toList())); + } + if (ds.getJournal() != null) { + metadata.add(mapJournal(ds.getJournal())); + } + if (ds.getSubjects() != null) { + metadata.addAll(ds.getSubjects() + 
.stream() + .map(sp -> mapStructuredProperty("subject", sp)) + .collect(Collectors.toList())); + } + + break; + case organization: + final Organization o = (Organization) entity; + + if (o.getLegalshortname() != null) { + metadata.add(asXmlElement("legalshortname", o.getLegalshortname().getValue())); + } + if (o.getLegalname() != null) { + metadata.add(asXmlElement("legalname", o.getLegalname().getValue())); + } + if (o.getAlternativeNames() != null) { + metadata.addAll(o.getAlternativeNames() + .stream() + .map(c -> asXmlElement("alternativeNames", c.getValue())) + .collect(Collectors.toList())); + } + if (o.getWebsiteurl() != null) { + metadata.add(asXmlElement("websiteurl", o.getWebsiteurl().getValue())); + } + if (o.getLogourl() != null) { + metadata.add(asXmlElement("websiteurl", o.getLogourl().getValue())); + } + + if (o.getEclegalbody() != null) { + metadata.add(asXmlElement("eclegalbody", o.getEclegalbody().getValue())); + } + if (o.getEclegalperson() != null) { + metadata.add(asXmlElement("eclegalperson", o.getEclegalperson().getValue())); + } + if (o.getEcnonprofit() != null) { + metadata.add(asXmlElement("ecnonprofit", o.getEcnonprofit().getValue())); + } + if (o.getEcresearchorganization() != null) { + metadata.add(asXmlElement("ecresearchorganization", o.getEcresearchorganization().getValue())); + } + if (o.getEchighereducation() != null) { + metadata.add(asXmlElement("echighereducation", o.getEchighereducation().getValue())); + } + if (o.getEcinternationalorganization() != null) { + metadata.add(asXmlElement("ecinternationalorganizationeurinterests", o.getEcinternationalorganization().getValue())); + } + if (o.getEcinternationalorganization() != null) { + metadata.add(asXmlElement("ecinternationalorganization", o.getEcinternationalorganization().getValue())); + } + if (o.getEcenterprise() != null) { + metadata.add(asXmlElement("ecenterprise", o.getEcenterprise().getValue())); + } + if (o.getEcsmevalidated() != null) { + 
metadata.add(asXmlElement("ecsmevalidated", o.getEcsmevalidated().getValue())); + } + if (o.getEcnutscode() != null) { + metadata.add(asXmlElement("ecnutscode", o.getEcnutscode().getValue())); + } + if (o.getCountry() != null) { + metadata.add(mapQualifier("country", o.getCountry())); + } + + break; + case project: + + final Project p = (Project) entity; + + if (p.getWebsiteurl() != null) { + metadata.add(asXmlElement("websiteurl", p.getWebsiteurl().getValue())); + } + if (p.getCode() != null) { + metadata.add(asXmlElement("code", p.getCode().getValue())); + } + if (p.getAcronym() != null) { + metadata.add(asXmlElement("acronym", p.getAcronym().getValue())); + } + if (p.getTitle() != null) { + metadata.add(asXmlElement("title", p.getTitle().getValue())); + } + if (p.getStartdate() != null) { + metadata.add(asXmlElement("startdate", p.getStartdate().getValue())); + } + if (p.getEnddate() != null) { + metadata.add(asXmlElement("enddate", p.getEnddate().getValue())); + } + if (p.getCallidentifier() != null) { + metadata.add(asXmlElement("callidentifier", p.getCallidentifier().getValue())); + } + if (p.getKeywords() != null) { + metadata.add(asXmlElement("keywords", p.getKeywords().getValue())); + } + if (p.getDuration() != null) { + metadata.add(asXmlElement("duration", p.getDuration().getValue())); + } + if (p.getEcsc39() != null) { + metadata.add(asXmlElement("ecsc39", p.getEcsc39().getValue())); + } + if (p.getEcarticle29_3() != null) { + metadata.add(asXmlElement("ecarticle29_3", p.getEcarticle29_3().getValue())); + } + if (p.getSubjects() != null) { + metadata.addAll(p.getSubjects() + .stream() + .map(sp -> mapStructuredProperty("subject", sp)) + .collect(Collectors.toList())); + } + if (p.getContracttype() != null) { + metadata.add(mapQualifier("contracttype", p.getContracttype())); + } + if (p.getEcsc39() != null) { + metadata.add(asXmlElement("ecsc39", p.getEcsc39().getValue())); + } + if (p.getContactfullname() != null) { + 
metadata.add(asXmlElement("contactfullname", p.getContactfullname().getValue())); + } + if (p.getContactfax() != null) { + metadata.add(asXmlElement("contactfax", p.getContactfax().getValue())); + } + if (p.getContactphone() != null) { + metadata.add(asXmlElement("contactphone", p.getContactphone().getValue())); + } + if (p.getContactemail() != null) { + metadata.add(asXmlElement("contactemail", p.getContactemail().getValue())); + } + if (p.getSummary() != null) { + metadata.add(asXmlElement("summary", p.getSummary().getValue())); + } + if (p.getCurrency() != null) { + metadata.add(asXmlElement("currency", p.getCurrency().getValue())); + } + if (p.getTotalcost() != null) { + metadata.add(asXmlElement("totalcost", p.getTotalcost().toString())); + } + if (p.getFundedamount() != null) { + metadata.add(asXmlElement("fundedamount", p.getFundedamount().toString())); + } + if (p.getFundingtree() != null) { + metadata.addAll(p.getFundingtree() + .stream() + .map(ft -> asXmlElement("fundingtree", ft.getValue())) + .collect(Collectors.toList())); + } + + break; + default: + throw new IllegalArgumentException("invalid entity type: " + type); + } + + return metadata; + } + + /** Adds a "datasourcetype" element for dsType, followed by a "datasourcetypeui" element whose classid and classname are replaced by "other" when the classid is listed in specialDatasourceTypes. The replacement is applied to a copy, so the Qualifier owned by the caller is never modified. */ + private void mapDatasourceType(List metadata, final Qualifier dsType) { + metadata.add(mapQualifier("datasourcetype", dsType)); + + Qualifier uiType = dsType; + if (specialDatasourceTypes.contains(dsType.getClassid())) { + /* copy instead of mutating dsType: the same Qualifier instance belongs to the entity being serialized */ + uiType = new Qualifier(); + uiType.setClassid("other"); + uiType.setClassname("other"); + uiType.setSchemeid(dsType.getSchemeid()); + uiType.setSchemename(dsType.getSchemename()); + } + metadata.add(mapQualifier("datasourcetypeui", uiType)); + } + + /** Picks the best access right among the instances of r. Starts from an UNKNOWN / "not available" qualifier in the dnet:access_modes scheme and replaces it with an instance's access right whenever LicenseComparator.compare(best, candidate) is positive. Returns the UNKNOWN default when the result has no instance list. */ + private Qualifier getBestAccessright(final Result r) { + Qualifier bestAccessRight = new Qualifier(); + bestAccessRight.setClassid("UNKNOWN"); + bestAccessRight.setClassname("not available"); + bestAccessRight.setSchemeid("dnet:access_modes"); + bestAccessRight.setSchemename("dnet:access_modes"); + + if (r.getInstance() == null) { + /* listChildren treats a null instance list as legal, so do the same here instead of failing with a NullPointerException */ + return bestAccessRight; + } + + final LicenseComparator lc = new LicenseComparator(); + for (final Instance instance : r.getInstance()) { + if (lc.compare(bestAccessRight, instance.getAccessright()) > 0) { + bestAccessRight
= instance.getAccessright(); + } + } + return bestAccessRight; + } + + /** Serializes every link of the joined entity as a rel element. For each link the metadata fields depend on the type of the related entity (result types, datasource, organization, project); the rel is then rendered through templateFactory.getRel with the inverse relation class and the relation's DataInfo. Throws IllegalArgumentException for an unsupported target type. */ + private List listRelations(final JoinedEntity je, TemplateFactory templateFactory) { + final List rels = Lists.newArrayList(); + + for (final Tuple2 link : je.getLinks()) { + + final Relation rel = link.getRelation(); + final RelatedEntity re = link.getRelatedEntity(); + final String targetType = re.getType(); + + final List metadata = Lists.newArrayList(); + switch (EntityType.valueOf(targetType)) { + case publication: + case dataset: + case otherresearchproduct: + case software: + if (re.getTitle() != null && isNotBlank(re.getTitle().getValue())) { + metadata.add(mapStructuredProperty("title", re.getTitle())); + } + if (isNotBlank(re.getDateofacceptance())) { + metadata.add(asXmlElement("dateofacceptance", re.getDateofacceptance())); + } + if (isNotBlank(re.getPublisher())) { + metadata.add(asXmlElement("publisher", re.getPublisher())); + } + if (isNotBlank(re.getCodeRepositoryUrl())) { + metadata.add(asXmlElement("coderepositoryurl", re.getCodeRepositoryUrl())); + } + /* qualifier guards use && so that isBlank() is not evaluated on a null qualifier */ + if (re.getResulttype() != null && !re.getResulttype().isBlank()) { + metadata.add(mapQualifier("resulttype", re.getResulttype())); + } + if (re.getCollectedfrom() != null) { + metadata.addAll(re.getCollectedfrom() + .stream() + .map(kv -> mapKeyValue("collectedfrom", kv)) + .collect(Collectors.toList())); + } + if (re.getPid() != null) { + metadata.addAll(re.getPid() + .stream() + .map(p -> mapStructuredProperty("pid", p)) + .collect(Collectors.toList())); + } + break; + case datasource: + if (isNotBlank(re.getOfficialname())) { + metadata.add(asXmlElement("officialname", re.getOfficialname())); + } + if (re.getDatasourcetype() != null && !re.getDatasourcetype().isBlank()) { + mapDatasourceType(metadata, re.getDatasourcetype()); + } + if (re.getOpenairecompatibility() != null && !re.getOpenairecompatibility().isBlank()) { + metadata.add(mapQualifier("openairecompatibility", re.getOpenairecompatibility())); + }
+ break; + case organization: + if (isNotBlank(re.getLegalname())) { + metadata.add(asXmlElement("legalname", re.getLegalname())); + } + if (isNotBlank(re.getLegalshortname())) { + metadata.add(asXmlElement("legalshortname", re.getLegalshortname())); + } + if (re.getCountry() != null && !re.getCountry().isBlank()) { + metadata.add(mapQualifier("country", re.getCountry())); + } + break; + case project: + if (isNotBlank(re.getProjectTitle())) { + metadata.add(asXmlElement("title", re.getProjectTitle())); + } + if (isNotBlank(re.getCode())) { + metadata.add(asXmlElement("code", re.getCode())); + } + if (isNotBlank(re.getAcronym())) { + metadata.add(asXmlElement("acronym", re.getAcronym())); + } + if (re.getContracttype() != null && !re.getContracttype().isBlank()) { + metadata.add(mapQualifier("contracttype", re.getContracttype())); + } + if (re.getFundingtree() != null) { + metadata.addAll(re.getFundingtree() + .stream() + .peek(ft -> fillContextMap(ft)) + .map(ft -> getRelFundingTree(ft)) + .collect(Collectors.toList())); + } + break; + default: + throw new IllegalArgumentException("invalid target type: " + targetType); + + } + final DataInfo info = rel.getDataInfo(); + + rels.add(templateFactory.getRel( + targetType, + rel.getTarget(), + Sets.newHashSet(metadata), + getInverseRelClass(rel.getRelClass()), + getScheme(targetType, re.getType()), + info)); + } + return rels; + } + + /** Serializes the children of a result entity: one instance element per Instance and one "externalreference" child per ExternalReference. Entities whose main type is not "result" yield an empty list. */ + private List listChildren(final JoinedEntity je, TemplateFactory templateFactory) { + + final List children = Lists.newArrayList(); + + if (MainEntityType.result.toString().equals(getMainType(je.getType()))) { + final List instances = ((Result) je.getEntity()).getInstance(); + if (instances != null) { + for (final Instance instance : ((Result) je.getEntity()).getInstance()) { + + final List fields = Lists.newArrayList(); + + if (instance.getAccessright() != null && !instance.getAccessright().isBlank()) { + fields.add(mapQualifier("accessright", instance.getAccessright())); + } + if
(instance.getCollectedfrom() != null) { + fields.add(mapKeyValue("collectedfrom", instance.getCollectedfrom())); + } + if (instance.getHostedby() != null) { + fields.add(mapKeyValue("hostedby", instance.getHostedby())); + } + if (instance.getDateofacceptance() != null && isNotBlank(instance.getDateofacceptance().getValue())) { + fields.add(asXmlElement("dateofacceptance", instance.getDateofacceptance().getValue())); + } + if (instance.getInstancetype() != null && !instance.getInstancetype().isBlank()) { + fields.add(mapQualifier("instancetype", instance.getInstancetype())); + } + if (isNotBlank(instance.getDistributionlocation())) { + fields.add(asXmlElement("distributionlocation", instance.getDistributionlocation())); + } + + /* NOTE(review): getHostedby() is null-checked a few lines above but dereferenced unconditionally here - confirm it can never be null, or decide which instance id to use when it is */ + children.add(templateFactory.getInstance(instance.getHostedby().getKey(), fields, instance.getUrl())); + } + } + final List ext = ((Result) je.getEntity()).getExternalReference(); + if (ext != null) { + for (final ExternalReference er : ((Result) je.getEntity()).getExternalReference()) { + + final List fields = Lists.newArrayList(); + + if (isNotBlank(er.getSitename())) { + fields.add(asXmlElement("sitename", er.getSitename())); + } + if (isNotBlank(er.getLabel())) { + fields.add(asXmlElement("label", er.getLabel())); + } + if (isNotBlank(er.getUrl())) { + fields.add(asXmlElement("url", er.getUrl())); + } + if (isNotBlank(er.getDescription())) { + fields.add(asXmlElement("description", er.getDescription())); + } + /* guard on the qualifier itself; the previous condition re-tested the url */ + if (er.getQualifier() != null && !er.getQualifier().isBlank()) { + fields.add(mapQualifier("qualifier", er.getQualifier())); + } + if (isNotBlank(er.getRefidentifier())) { + fields.add(asXmlElement("refidentifier", er.getRefidentifier())); + } + if (isNotBlank(er.getQuery())) { + fields.add(asXmlElement("query", er.getQuery())); + } + + children.add(templateFactory.getChild("externalreference", null, fields)); + } + } + } + + return children; + } + + /** Maps each ExtraInfo of the entity through mapExtraInfo; returns an empty list when the entity has none. */ + private List listExtraInfo(JoinedEntity je) { + final List extraInfo = je.getEntity().getExtraInfo(); + return
extraInfo != null ? extraInfo + .stream() + .map(e -> mapExtraInfo(e)) + .collect(Collectors.toList()) : Lists.newArrayList(); + } + + /** Serializes the collected contexts for records whose main type is "result". Each path in contextes is split on "::" and every cumulative prefix is looked up in contextMapper: "context" definitions are added under the root unless already present, "category" definitions under their root context, "concept" definitions at the current cursor position. Every child element of the root is then rendered to a string. Returns an empty list when contextMapper is null or empty, or when type is not "result". */ + private List buildContexts(final String type) { + final List res = Lists.newArrayList(); + + if ((contextMapper != null) && !contextMapper.isEmpty() && MainEntityType.result.toString().equals(type)) { + + XMLTag document = XMLDoc.newDocument(true).addRoot("contextRoot"); + + for (final String context : contextes) { + + String id = ""; + for (final String token : Splitter.on("::").split(context)) { + id += token; + + final ContextDef def = contextMapper.get(id); + + if (def == null) { + /* NOTE(review): this continue also skips the id += "::" at the end of the loop body, so the ids built for the remaining tokens of this path lack a separator - confirm whether dropping the deeper levels is intended */ + continue; + // throw new IllegalStateException(String.format("cannot find context for id '%s'", id)); + } + + if (def.getName().equals("context")) { + final String xpath = "//context/@id='" + def.getId() + "'"; + if (!document.gotoRoot().rawXpathBoolean(xpath, new Object())) { + document = addContextDef(document.gotoRoot(), def); + } + } + + if (def.getName().equals("category")) { + final String rootId = substringBefore(def.getId(), "::"); + document = addContextDef(document.gotoRoot().gotoTag("//context[./@id='" + rootId + "']", new Object()), def); + } + + if (def.getName().equals("concept")) { + document = addContextDef(document, def).gotoParent(); + } + id += "::"; + } + } + final Transformer transformer = getTransformer(); + for (final org.w3c.dom.Element x : document.gotoRoot().getChildElement()) { + try { + res.add(asStringElement(x, transformer)); + } catch (final TransformerException e) { + throw new RuntimeException(e); + } + } + } + + return res; + } + + /** Creates a Transformer that omits the XML declaration; a TransformerConfigurationException is rethrown as IllegalStateException with the cause preserved. */ + private Transformer getTransformer() { + try { + Transformer transformer = TransformerFactory.newInstance().newTransformer(); + transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); + return transformer; + } catch (TransformerConfigurationException e) { + throw new IllegalStateException("unable to create javax.xml.transform.Transformer", e); + } + } + + /** Adds a tag named def.getName() carrying the id and label attributes of def, plus a type attribute when def.getType() is non-empty, and returns the tag reference it was given. */ + private XMLTag
addContextDef(final XMLTag tag, final ContextDef def) { + tag.addTag(def.getName()).addAttribute("id", def.getId()).addAttribute("label", def.getLabel()); + if ((def.getType() != null) && !def.getType().isEmpty()) { + tag.addAttribute("type", def.getType()); + } + return tag; + } + + /** Renders a DOM element to its string form using the supplied transformer. */ + private String asStringElement(final org.w3c.dom.Element element, final Transformer transformer) throws TransformerException { + final StringWriter buffer = new StringWriter(); + transformer.transform(new DOMSource(element), new StreamResult(buffer)); + return buffer.toString(); + } + + /** Parses a funding tree and registers what it finds: the funder short name is added to contextes and stored in contextMapper as a "context" of type "funding"; funding level 0 is stored as a "category", levels 1 and 2 as "concept". The id of the deepest level present is added to contextes. A DocumentException raised while parsing is rethrown wrapped in a RuntimeException. */ + private void fillContextMap(final String xmlTree) { + + Document fundingPath; + try { + fundingPath = new SAXReader().read(new StringReader(xmlTree)); + } catch (final DocumentException e) { + throw new RuntimeException(e); + } + try { + final Node funder = fundingPath.selectSingleNode("//funder"); + + if (funder != null) { + + final String funderShortName = funder.valueOf("./shortname"); + contextes.add(funderShortName); + + contextMapper.put(funderShortName, new ContextDef(funderShortName, funder.valueOf("./name"), "context", "funding")); + final Node level0 = fundingPath.selectSingleNode("//funding_level_0"); + if (level0 != null) { + final String level0Id = Joiner.on("::").join(funderShortName, level0.valueOf("./name")); + contextMapper.put(level0Id, new ContextDef(level0Id, level0.valueOf("./description"), "category", "")); + final Node level1 = fundingPath.selectSingleNode("//funding_level_1"); + if (level1 == null) { + contextes.add(level0Id); + } else { + final String level1Id = Joiner.on("::").join(level0Id, level1.valueOf("./name")); + contextMapper.put(level1Id, new ContextDef(level1Id, level1.valueOf("./description"), "concept", "")); + final Node level2 = fundingPath.selectSingleNode("//funding_level_2"); + if (level2 == null) { + contextes.add(level1Id); + } else { + final String level2Id = Joiner.on("::").join(level1Id, level2.valueOf("./name")); + contextMapper.put(level2Id, new
ContextDef(level2Id, level2.valueOf("./description"), "concept", "")); + contextes.add(level2Id); + } + } + } + } + } catch (final NullPointerException e) { + throw new IllegalArgumentException("malformed funding path: " + xmlTree, e); + } + } + + + + /** Builds the funding fragment embedded in a project relation: the funder element followed by one element per funding_level_* node of the tree, visited in reverse document order, each carrying the escaped level name as attribute and the escaped level id as content. Throws IllegalArgumentException, with the parser exception as cause, when xmlTree cannot be parsed. NOTE(review): several string literals in this method and in getFunderElement are empty in this copy and look truncated - confirm against the original source. */ + @SuppressWarnings("unchecked") + private String getRelFundingTree(final String xmlTree) { + String funding = ""; + try { + final Document ftree = new SAXReader().read(new StringReader(xmlTree)); + funding = ""; + + funding += getFunderElement(ftree); + + for (final Object o : Lists.reverse(ftree.selectNodes("//fundingtree//*[starts-with(local-name(),'funding_level_')]"))) { + final Element e = (Element) o; + final String _id = e.valueOf("./id"); + funding += "<" + e.getName() + " name=\"" + escapeXml(e.valueOf("./name")) + "\">" + escapeXml(_id) + ""; + } + } catch (final DocumentException e) { + /* keep the parser exception as cause so the stack trace shows where parsing failed */ + throw new IllegalArgumentException("unable to parse funding tree: " + xmlTree + "\n" + e.getMessage(), e); + } finally { + funding += ""; + } + return funding; + } + + private String getFunderElement(final Document ftree) { + final String funderId = ftree.valueOf("//fundingtree/funder/id/text()"); + final String funderShortName = ftree.valueOf("//fundingtree/funder/shortname/text()"); + final String funderName = ftree.valueOf("//fundingtree/funder/name/text()"); + final String funderJurisdiction = ftree.valueOf("//fundingtree/funder/jurisdiction/text()"); + + return ""; + } + +} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/XmlSerializationUtils.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/XmlSerializationUtils.java new file mode 100644 index 000000000..3088828ab --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/XmlSerializationUtils.java @@ -0,0 +1,151 @@ +package eu.dnetlib.dhp.graph.utils; + +import eu.dnetlib.dhp.schema.oaf.*; + +import static
eu.dnetlib.dhp.graph.utils.GraphMappingUtils.removePrefix; +import static org.apache.commons.lang3.StringUtils.isBlank; +import static org.apache.commons.lang3.StringUtils.isNotBlank; + +/** Static helpers that render OAF model objects (journals, qualifiers, structured properties, key/values, data info) as XML fragments. */ +public class XmlSerializationUtils { + + // XML 1.0 + // #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] + private final static String xml10pattern = "[^" + + "\u0009\r\n" + + "\u0020-\uD7FF" + + "\uE000-\uFFFD" + + "\ud800\udc00-\udbff\udfff" + + "]"; + + /** Renders a journal: the non-blank ISSN, page, issue and volume values are collected as attributes and the escaped journal name is appended. NOTE(review): the two append("") literals look truncated in this copy (attrs is computed but never appended) - confirm against the original source. */ + public static String mapJournal(Journal j) { + final String attrs = new StringBuilder() + .append(attr("issn", j.getIssnPrinted())) + .append(attr("eissn", j.getIssnOnline())) + .append(attr("lissn", j.getIssnLinking())) + .append(attr("ep", j.getEp())) + .append(attr("iss", j.getIss())) + .append(attr("sp", j.getSp())) + .append(attr("vol", j.getVol())) + .toString() + .trim(); + + return new StringBuilder() + .append("") + .append(escapeXml(j.getName())) + .append("") + .toString(); + } + + /** Renders name="escaped value" followed by a single space, or the empty string when value is blank. */ + private static String attr(final String name, final String value) { + return isNotBlank(value) ? name + "=\"" + escapeXml(value) + "\" " : ""; + } + + /** Renders a structured property as an element named name, with its value as content and its qualifier and data info as attributes. */ + public static String mapStructuredProperty(String name, StructuredProperty t) { + return asXmlElement(name, t.getValue(), t.getQualifier(), t.getDataInfo() != null ?
t.getDataInfo() : null); + } + + /** Renders a qualifier as an empty element named name that carries the qualifier attributes. */ + public static String mapQualifier(String name, Qualifier q) { + return asXmlElement(name, "", q, null); + } + + /** Escapes value for use in XML content and attribute values: the five predefined entities are substituted (ampersand first, so that the entities produced by the later steps are not escaped again) and every character outside the XML 1.0 character range is removed. Returns the empty string for a null value. */ + public static String escapeXml(final String value) { + if (value == null) { + return ""; + } + return value + .replace("&", "&amp;") + .replace("<", "&lt;") + .replace(">", "&gt;") + .replace("\"", "&quot;") + .replace("'", "&apos;") + .replaceAll(xml10pattern, ""); + } + + public static String parseDataInfo(final DataInfo dataInfo) { + return new StringBuilder() + .append("") + .append(asXmlElement("inferred", dataInfo.getInferred() + "")) + .append(asXmlElement("deletedbyinference", dataInfo.getDeletedbyinference() + "")) + .append(asXmlElement("trust", dataInfo.getTrust() + "")) + .append(asXmlElement("inferenceprovenance", dataInfo.getInferenceprovenance() + "")) + .append(asXmlElement("provenanceaction", null, dataInfo.getProvenanceaction(), null)) + .append("") + .toString(); + } + + private static StringBuilder dataInfoAsAttributes(final StringBuilder sb, final DataInfo info) { + return sb + .append(attr("inferred", info.getInferred() != null ? info.getInferred().toString() : "")) + .append(attr("inferenceprovenance", info.getInferenceprovenance())) + .append(attr("provenanceaction", info.getProvenanceaction() != null ?
info.getProvenanceaction().getClassid() : "")) + .append(attr("trust", info.getTrust())); + } + + /** Renders a key/value pair as an empty element named name, with the escaped value as the name attribute and the escaped, prefix-stripped key as the id attribute. */ + public static String mapKeyValue(final String name, final KeyValue kv) { + return new StringBuilder() + .append("<") + .append(name) + .append(" name=\"") + .append(escapeXml(kv.getValue())) + .append("\" id=\"") + .append(escapeXml(removePrefix(kv.getKey()))) + .append("\"/>") + .toString(); + } + + public static String mapExtraInfo(final ExtraInfo e) { + return new StringBuilder("") + .append(e.getValue()) + .append("") + .toString(); + } + + /** Renders an element without attributes; see the four-argument overload. */ + public static String asXmlElement(final String name, final String value) { + return asXmlElement(name, value, null, null); + } + + /** Renders an element named name. The qualifier attributes are added when q is not null and the data-info attributes when info is not null. A blank value produces a self-closing element; otherwise the escaped value is written as content and the element is closed with a matching end tag. */ + public static String asXmlElement(final String name, final String value, final Qualifier q, final DataInfo info) { + StringBuilder sb = new StringBuilder(); + sb.append("<"); + sb.append(name); + if (q != null) { + sb.append(getAttributes(q)); + } + if (info != null) { + /* the inferred / inferenceprovenance / provenanceaction / trust attributes come from the shared helper instead of a second copy of the same code */ + dataInfoAsAttributes(sb.append(" "), info); + }
+ if (isBlank(value)) { + sb.append("/>"); + return sb.toString(); + } + + sb.append(">"); + sb.append(escapeXml(value)); + sb.append("</").append(name).append(">"); + + return sb.toString(); + } + + /** Renders the classid, classname, schemeid and schemename of q as attributes preceded by a space; returns the empty string when q is null or blank. */ + public static String getAttributes(final Qualifier q) { + if (q == null || q.isBlank()) return ""; + + return new StringBuilder(" ") + .append(attr("classid", q.getClassid())) + .append(attr("classname", q.getClassname())) + .append(attr("schemeid", q.getSchemeid())) + .append(attr("schemename", q.getSchemename())) + .toString(); + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/META-INF/services/javax.xml.transform.TransformerFactory b/dhp-workflows/dhp-graph-provision/src/main/resources/META-INF/services/javax.xml.transform.TransformerFactory new file mode 100644 index 000000000..b53ca855f --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/META-INF/services/javax.xml.transform.TransformerFactory @@ -0,0 +1 @@ +net.sf.saxon.TransformerFactoryImpl \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_graph_parameters.json b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_params_build_adjacency_lists.json similarity index 65% rename from dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_graph_parameters.json rename to dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_params_build_adjacency_lists.json index cbd4285bf..e63322028 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_graph_parameters.json +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_params_build_adjacency_lists.json @@ -1,5 +1,6 @@ [ {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, +
{"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true}, {"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the path used to store temporary output files", "paramRequired": true}, - {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true} + {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequence file to read", "paramRequired": true} ] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_params_update_index.json b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_params_update_index.json new file mode 100644 index 000000000..0d45e9e29 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_params_update_index.json @@ -0,0 +1,7 @@ +[ + {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, + {"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true}, + {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequence file to read the XML records", "paramRequired": true}, + {"paramName":"f", "paramLongName":"format", "paramDescription": "MDFormat name found in the IS profile", "paramRequired": true}, + {"paramName":"b", "paramLongName":"batchSize", "paramDescription": "size of the batch of documents sent to solr", "paramRequired": false} +] diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml index 4b4d2c7bf..fee463868 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml 
+++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml @@ -26,7 +26,15 @@ - + + + + + ${wf:conf('reuseRecords') eq false} + ${wf:conf('reuseRecords') eq true} + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] @@ -36,7 +44,7 @@ ${jobTracker} ${nameNode} - yarn-cluster + yarn cluster build_adjacency_lists eu.dnetlib.dhp.graph.SparkXmlRecordBuilderJob @@ -47,14 +55,43 @@ --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" - --conf spark.sql.warehouse.dir="/user/hive/warehouse" --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - -mt yarn-cluster + -mt yarn + -is ${isLookupUrl} --sourcePath${sourcePath} --outputPath${outputPath} + + + + + + + ${jobTracker} + ${nameNode} + yarn + cluster + to_solr_index + eu.dnetlib.dhp.graph.SparkXmlIndexingJob + dhp-graph-provision-${projectVersion}.jar + + --executor-memory ${sparkExecutorMemory} + --executor-cores ${sparkExecutorCoresForIndexing} + --driver-memory=${sparkDriverMemory} + --conf spark.executor.instances=${sparkExecutorInstances} + --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" + --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + -mt yarn + -is ${isLookupUrl} + --sourcePath${outputPath}/xml + --format${format} + --batchSize${batchSize} + diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/child.st b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/child.st new file mode 100644 index 000000000..89f81e16b --- /dev/null +++ 
b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/child.st @@ -0,0 +1,3 @@ +> + $metadata:{ it | $it$ }$ + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/entity.st b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/entity.st new file mode 100644 index 000000000..d16f3c3e0 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/entity.st @@ -0,0 +1,10 @@ + + $metadata:{ it | $it$ }$ + + $rels:{ it | $it$ }$ + + + $children:{ it | $it$ }$ + + +$extrainfo:{ it | $it$ }$ \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/instance.st b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/instance.st new file mode 100644 index 000000000..64bed05b4 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/instance.st @@ -0,0 +1,4 @@ + + $metadata:{ it | $it$ }$ + $webresources:{ it | $it$ }$ + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/record.st b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/record.st new file mode 100644 index 000000000..dea68eab8 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/record.st @@ -0,0 +1,17 @@ + + + +
+ $id$ + $dateofcollection$ + $dateoftransformation$ +
+ + + $it$ + + +
+
\ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/rel.st b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/rel.st new file mode 100644 index 000000000..af19ba497 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/rel.st @@ -0,0 +1,4 @@ + + $objIdentifier$ + $metadata:{ it | $it$ }$ + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/webresource.st b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/webresource.st new file mode 100644 index 000000000..7ff6c5d7f --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/webresource.st @@ -0,0 +1,3 @@ + + $identifier$ + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/MappingUtilsTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/MappingUtilsTest.java index fdff4d984..a9d696bea 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/MappingUtilsTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/MappingUtilsTest.java @@ -1,5 +1,8 @@ package eu.dnetlib.dhp.graph; +import eu.dnetlib.dhp.graph.model.EntityRelEntity; +import eu.dnetlib.dhp.graph.model.RelatedEntity; +import eu.dnetlib.dhp.graph.utils.GraphMappingUtils; import org.codehaus.jackson.map.ObjectMapper; import org.junit.Before; import org.junit.Test; @@ -23,23 +26,34 @@ public class MappingUtilsTest { final EntityRelEntity e = new ObjectMapper().readValue(in, EntityRelEntity.class); e.getSource().setType("datasource"); - final EntityRelEntity out = utils.pruneModel(e); + final EntityRelEntity out = utils.asRelatedEntity(e); + System.out.println(out); + + } + + //@Test + public void testOafMappingResult() 
throws IOException { + + final InputStreamReader in = new InputStreamReader(getClass().getResourceAsStream("result.json")); + final EntityRelEntity e = new ObjectMapper().readValue(in, EntityRelEntity.class); + + final EntityRelEntity out = utils.asRelatedEntity(e); System.out.println(out); } @Test - public void testOafMappinResult() throws IOException { + public void testOafMappingSoftware() throws IOException { - final InputStreamReader in = new InputStreamReader(getClass().getResourceAsStream("result.json")); + final InputStreamReader in = new InputStreamReader(getClass().getResourceAsStream("software.json")); final EntityRelEntity e = new ObjectMapper().readValue(in, EntityRelEntity.class); - e.getSource().setType("otherresearchproduct"); - final EntityRelEntity out = utils.pruneModel(e); + final EntityRelEntity out = utils.asRelatedEntity(e); System.out.println(out); } + @Test public void testParseRelatedEntity() throws IOException { diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/XmlRecordFactoryTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/XmlRecordFactoryTest.java new file mode 100644 index 000000000..2a3c343ec --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/XmlRecordFactoryTest.java @@ -0,0 +1,55 @@ +package eu.dnetlib.dhp.graph; + +import eu.dnetlib.dhp.graph.utils.ContextMapper; +import org.apache.commons.io.FileUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.spark.sql.SparkSession; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +public class XmlRecordFactoryTest { + + private static final Log log = LogFactory.getLog(XmlRecordFactoryTest.class); + + private Path testDir; + + @Before + public void setup() throws IOException { + testDir = 
Files.createTempDirectory(getClass().getSimpleName()); + log.info("created test directory " + testDir.toString()); + } + + @After + public void tearDown() throws IOException { + FileUtils.deleteDirectory(testDir.toFile()); + log.info("deleted test directory " + testDir.toString()); + } + + @Test + public void testXmlSerialization() throws Exception { + + final SparkSession spark = SparkSession + .builder() + .appName(SparkXmlRecordBuilderJob.class.getSimpleName()) + .master("local[*]") + .getOrCreate(); + + final String inputDir = testDir.toString() + "/3_joined_entities"; + FileUtils.forceMkdir(new File(inputDir)); + FileUtils.copyFile(new File("/Users/claudio/Downloads/joined_entities-part-00000"), new File(inputDir + "/joined_entities-part-00000")); + + final ContextMapper ctx = ContextMapper.fromIS("https://dev-openaire.d4science.org:443/is/services/isLookUp"); + + final GraphJoiner g = new GraphJoiner(spark, ctx, inputDir, testDir.toString()); + + g.asXML(); + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/software.json b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/software.json new file mode 100644 index 000000000..0065b6799 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/software.json @@ -0,0 +1 @@ 
+{"type":"software","entity":{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0,"id":"50|od______2659::05817f64c43a918a07483340b5726f77","originalId":["oai:zenodo.org:204139"],"collectedfrom":[{"key":"10|opendoar____::358aee4cc897452c00244351e4d91f69","value":"ZENODO","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"pid":[],"extraInfo":[],"author":[],"resulttype":{"classid":"software","classname":"software","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"language":{"classid":"und","classname":"Undetermined","schemeid":"dnet:languages","schemename":"dnet:languages"},"country":[],"subject":[],"title":[],"relevantdate":[],"description":[],"dateofacceptance":{"value":"2016-01-01","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"refereed":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trus
t":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"context":[],"instance":[{"license":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"instancetype":{"classid":"0029","classname":"Software","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"hostedby":{"key":"10|opendoar____::358aee4cc897452c00244351e4d91f69","value":"ZENODO","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"url":[],"distributionlocation":"","collectedfrom":{"key":"10|opendoar____::358aee4cc897452c00244351e4d91f69","value":"ZENODO","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"dateofacceptance":{"value":"2016-01-01","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}}],"documentationUrl":[],"license":[],"codeRepositoryUrl":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"programmingLanguage":{"classid":"","classname":"","schemeid":"","schemename":""}},"links":[{"relation":{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimpo
rt:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od______2659::05817f64c43a918a07483340b5726f77","target":"40|corda__h2020::e2a38892773e6541ec7c07aa605ad581"},"relatedEntity":{"id":"40|corda__h2020::e2a38892773e6541ec7c07aa605ad581","type":"project","projectTitle":"Engaging the EGI Community towards an Open Science Commons","code":"654142","acronym":"EGI-Engage","contracttype":{},"fundingtree":["{value=ec__________::ECECEuropean CommissionEUec__________::EC::H2020::RIAResearch and Innovation actionRIAec:h2020toasec__________::EC::H2020H2020Horizon 2020 Framework Programmeec:h2020fundings, dataInfo={invisible=false, inferred=false, deletedbyinference=false, trust=, inferenceprovenance=, provenanceaction={classid=, classname=, schemeid=, schemename=}}}"]}},{"relation":{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od______2659::05817f64c43a918a07483340b5726f77","target":"40|corda_______::4d31ccb13726266f9098129756e03f43"},"relatedEntity":{"id":"40|corda_______::4d31ccb13726266f9098129756e03f43","type":"project","projectTitle":"Common Operations of Environmental Research Infrastructures","code":"283465","acronym":"ENVRI","contracttype":{},"fundingtree":["{value=ec__________::ECECEuropean CommissionEUec__________::EC::FP7::SP4::INFRAResearch InfrastructuresINFRAec:programec__________::EC::FP7::SP4SP4-CapacitiesSP4ec:specificprogramec__________::EC::FP7SEVENTH FRAMEWORK PROGRAMMEFP7ec:frameworkprogram, dataInfo={invisible=false, inferred=false, 
deletedbyinference=false, trust=, inferenceprovenance=, provenanceaction={classid=, classname=, schemeid=, schemename=}}}"]}},{"relation":{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od______2659::05817f64c43a918a07483340b5726f77","target":"40|corda_______::5af7655a8e0e871cf16072b4b6ab9b41"},"relatedEntity":{"id":"40|corda_______::5af7655a8e0e871cf16072b4b6ab9b41","type":"project","projectTitle":"Data e-Infrastructure Initiative for Fisheries Management and Conservation of Marine Living Resources","code":"283644","acronym":"IMARINE","contracttype":{},"fundingtree":["{value=ec__________::ECECEuropean CommissionEUec__________::EC::FP7::SP4::INFRAResearch InfrastructuresINFRAec:programec__________::EC::FP7::SP4SP4-CapacitiesSP4ec:specificprogramec__________::EC::FP7SEVENTH FRAMEWORK PROGRAMMEFP7ec:frameworkprogram, dataInfo={invisible=false, inferred=false, deletedbyinference=false, trust=, inferenceprovenance=, provenanceaction={classid=, classname=, schemeid=, 
schemename=}}}"]}},{"relation":{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od______2659::05817f64c43a918a07483340b5726f77","target":"40|corda_______::e8da2e3e130ad3b1a650487d9ff126e4"},"relatedEntity":{"id":"40|corda_______::e8da2e3e130ad3b1a650487d9ff126e4","type":"project","projectTitle":"EU-Brazil Open Data and Cloud Computing e-Infrastructure for Biodiversity","code":"288754","acronym":"EUBRAZILOPENBIO","contracttype":{},"fundingtree":["{value=ec__________::ECECEuropean CommissionEUec__________::EC::FP7::SP1::ICTInformation and Communication TechnologiesICTec:programec__________::EC::FP7::SP1SP1-CooperationSP1ec:specificprogramec__________::EC::FP7SEVENTH FRAMEWORK PROGRAMMEFP7ec:frameworkprogram, dataInfo={invisible=false, inferred=false, deletedbyinference=false, trust=, inferenceprovenance=, provenanceaction={classid=, classname=, schemeid=, schemename=}}}"]}},{"relation":{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od______2659::05817f64c43a918a07483340b5726f77","target":"40|corda_______::15463ed3cba51f042181197cfabb2ff5"},"relatedEntity":{"id":"40|corda_______::15463ed3cba51f042181197cfabb2ff5","type":"project","projectTitle":"Data Infrastructure Ecosystem for 
Science","code":"239019","acronym":"D4SCIENCE-II","contracttype":{},"fundingtree":["{value=ec__________::ECECEuropean CommissionEUec__________::EC::FP7::SP4::INFRAResearch InfrastructuresINFRAec:programec__________::EC::FP7::SP4SP4-CapacitiesSP4ec:specificprogramec__________::EC::FP7SEVENTH FRAMEWORK PROGRAMMEFP7ec:frameworkprogram, dataInfo={invisible=false, inferred=false, deletedbyinference=false, trust=, inferenceprovenance=, provenanceaction={classid=, classname=, schemeid=, schemename=}}}"]}},{"relation":{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od______2659::05817f64c43a918a07483340b5726f77","target":"40|corda__h2020::4d46893df18bb77f5d817b8ce98ac56c"},"relatedEntity":{"id":"40|corda__h2020::4d46893df18bb77f5d817b8ce98ac56c","type":"project","projectTitle":"Pooling Activities, Resources and Tools for Heritage E-research Networking, Optimization and Synergies","code":"654119","acronym":"PARTHENOS","contracttype":{},"fundingtree":["{value=ec__________::ECECEuropean CommissionEUec__________::EC::H2020::RIAResearch and Innovation actionRIAec:h2020toasec__________::EC::H2020H2020Horizon 2020 Framework Programmeec:h2020fundings, dataInfo={invisible=false, inferred=false, deletedbyinference=false, trust=, inferenceprovenance=, provenanceaction={classid=, classname=, schemeid=, 
schemename=}}}"]}},{"relation":{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od______2659::05817f64c43a918a07483340b5726f77","target":"40|corda_______::7f18b83690e3a18134b9a3db66d882d3"},"relatedEntity":{"id":"40|corda_______::7f18b83690e3a18134b9a3db66d882d3","type":"project","projectTitle":"DIstributed colLaboratories Infrastructure on Grid ENabled Technology 4 Science","code":"212488","acronym":"D4SCIENCE","contracttype":{},"fundingtree":["{value=ec__________::ECECEuropean CommissionEUec__________::EC::FP7::SP4::INFRAResearch InfrastructuresINFRAec:programec__________::EC::FP7::SP4SP4-CapacitiesSP4ec:specificprogramec__________::EC::FP7SEVENTH FRAMEWORK PROGRAMMEFP7ec:frameworkprogram, dataInfo={invisible=false, inferred=false, deletedbyinference=false, trust=, inferenceprovenance=, provenanceaction={classid=, classname=, schemeid=, schemename=}}}"]}},{"relation":{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od______2659::05817f64c43a918a07483340b5726f77","target":"40|corda__h2020::6729c0ee95de7724deb60454bb4179de"},"relatedEntity":{"id":"40|corda__h2020::6729c0ee95de7724deb60454bb4179de","type":"project","projectTitle":"Building Research environments for fostering Innovation, Decision making, Governance and Education to support Blue 
growth","code":"675680","acronym":"BlueBRIDGE","contracttype":{},"fundingtree":["{value=ec__________::ECECEuropean CommissionEUec__________::EC::H2020::RIAResearch and Innovation actionRIAec:h2020toasec__________::EC::H2020H2020Horizon 2020 Framework Programmeec:h2020fundings, dataInfo={invisible=false, inferred=false, deletedbyinference=false, trust=, inferenceprovenance=, provenanceaction={classid=, classname=, schemeid=, schemename=}}}"]}},{"relation":{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od______2659::05817f64c43a918a07483340b5726f77","target":"40|corda__h2020::0da81b3ad78047f577dd405e8a2d7f07"},"relatedEntity":{"id":"40|corda__h2020::0da81b3ad78047f577dd405e8a2d7f07","type":"project","projectTitle":"Environmental Research Infrastructures Providing Shared Solutions for Science and Society","code":"654182","acronym":"ENVRI PLUS","contracttype":{},"fundingtree":["{value=ec__________::ECECEuropean CommissionEUec__________::EC::H2020::RIAResearch and Innovation actionRIAec:h2020toasec__________::EC::H2020H2020Horizon 2020 Framework Programmeec:h2020fundings, dataInfo={invisible=false, inferred=false, deletedbyinference=false, trust=, inferenceprovenance=, provenanceaction={classid=, classname=, schemeid=, 
schemename=}}}"]}},{"relation":{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od______2659::05817f64c43a918a07483340b5726f77","target":"40|corda__h2020::e7f5e7755409fc74eea9d168ab795634"},"relatedEntity":{"id":"40|corda__h2020::e7f5e7755409fc74eea9d168ab795634","type":"project","projectTitle":"SoBigData Research Infrastructure","code":"654024","acronym":"SoBigData","contracttype":{},"fundingtree":["{value=ec__________::ECECEuropean CommissionEUec__________::EC::H2020::RIAResearch and Innovation actionRIAec:h2020toasec__________::EC::H2020H2020Horizon 2020 Framework Programmeec:h2020fundings, dataInfo={invisible=false, inferred=false, deletedbyinference=false, trust=, inferenceprovenance=, provenanceaction={classid=, classname=, schemeid=, schemename=}}}"]}}]} \ No newline at end of file diff --git a/pom.xml b/pom.xml index aedf5ebff..f14877500 100644 --- a/pom.xml +++ b/pom.xml @@ -96,6 +96,12 @@ ${dhp.hadoop.version} provided
+ + org.apache.hadoop + hadoop-common + ${dhp.hadoop.version} + provided + org.apache.hadoop hadoop-client @@ -149,7 +155,7 @@ net.sf.saxon Saxon-HE - 9.5.1-5 + 9.9.1-6 @@ -170,6 +176,51 @@ 1.1.6 + + com.mycila.xmltool + xmltool + 3.3 + + + + org.apache.solr + solr-solrj + 7.5.0 + + + * + * + + + + + com.lucidworks.spark + spark-solr + 3.6.0 + + + * + * + + + + + + org.apache.httpcomponents + httpclient + 4.5.3 + + + org.noggit + noggit + 0.8 + + + org.apache.zookeeper + zookeeper + 3.4.11 + + net.schmizz sshj @@ -202,8 +253,17 @@ dnet-pace-core 4.0.0-SNAPSHOT + + eu.dnetlib + cnr-rmi-api + [2.0.0,3.0.0) + - + + org.apache.cxf + cxf-rt-transports-http + 3.1.5 + javax.persistence javax.persistence-api @@ -231,6 +291,11 @@ secondstring 1.0.0 + + org.antlr + stringtemplate + 4.0 + org.apache.oozie From bbf1b611b996a0760307070a619dc3bc510faa2a Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 13 Feb 2020 17:21:11 +0100 Subject: [PATCH 36/45] refereed, processingchargeamount and processingchargecurrency moved inside the Instance element. 
Introduced specific type to model Result's countries --- .../eu/dnetlib/dhp/schema/oaf/Country.java | 15 +++++ .../eu/dnetlib/dhp/schema/oaf/Instance.java | 30 ++++++++++ .../eu/dnetlib/dhp/schema/oaf/Result.java | 57 ++++--------------- 3 files changed, 55 insertions(+), 47 deletions(-) create mode 100644 dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Country.java diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Country.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Country.java new file mode 100644 index 000000000..e81120e42 --- /dev/null +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Country.java @@ -0,0 +1,15 @@ +package eu.dnetlib.dhp.schema.oaf; + +public class Country extends Qualifier { + + private DataInfo dataInfo; + + public DataInfo getDataInfo() { + return dataInfo; + } + + public void setDataInfo(DataInfo dataInfo) { + this.dataInfo = dataInfo; + } + +} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java index 8f852af65..f82296d8b 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java @@ -22,6 +22,14 @@ public class Instance implements Serializable { private Field dateofacceptance; + // ( article | book ) processing charges. Defined here to cope with possible wrongly typed results + private Field processingchargeamount; + + // currency - alphabetic code described in ISO-4217.
Defined here to cope with possible wrongly typed results + private Field processingchargecurrency; + + private Field refereed; //peer-review status + public Field getLicense() { return license; } @@ -86,7 +94,29 @@ public class Instance implements Serializable { this.dateofacceptance = dateofacceptance; } + public Field getProcessingchargeamount() { + return processingchargeamount; + } + public void setProcessingchargeamount(Field processingchargeamount) { + this.processingchargeamount = processingchargeamount; + } + + public Field getProcessingchargecurrency() { + return processingchargecurrency; + } + + public void setProcessingchargecurrency(Field processingchargecurrency) { + this.processingchargecurrency = processingchargecurrency; + } + + public Field getRefereed() { + return refereed; + } + + public void setRefereed(Field refereed) { + this.refereed = refereed; + } public String toComparableString(){ return String.format("%s::%s::%s::%s", diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java index eb5572ce1..5cb04da5e 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java @@ -1,10 +1,8 @@ package eu.dnetlib.dhp.schema.oaf; -import org.apache.commons.lang3.StringUtils; - import java.io.Serializable; -import java.util.*; -import java.util.stream.Collectors; +import java.util.Comparator; +import java.util.List; public abstract class Result extends OafEntity implements Serializable { @@ -16,7 +14,7 @@ public abstract class Result extends OafEntity implements Serializable { // common fields private Qualifier language; - private List country; + private List country; private List subject; @@ -44,16 +42,10 @@ public abstract class Result extends OafEntity implements Serializable { private List> coverage; - private Field refereed; //peer-review status + private Qualifier bestaccessright; 
private List context; - // ( article | book ) processing charges. Defined here to cope with possible wrongly typed results - private Field processingchargeamount; - - // currency - alphabetic code describe in ISO-4217. Defined here to cope with possible wrongly typed results - private Field processingchargecurrency; - private List externalReference; private List instance; @@ -82,11 +74,11 @@ public abstract class Result extends OafEntity implements Serializable { this.language = language; } - public List getCountry() { + public List getCountry() { return country; } - public void setCountry(List country) { + public void setCountry(List country) { this.country = country; } @@ -194,12 +186,12 @@ public abstract class Result extends OafEntity implements Serializable { this.coverage = coverage; } - public Field getRefereed() { - return refereed; + public Qualifier getBestaccessright() { + return bestaccessright; } - public void setRefereed(Field refereed) { - this.refereed = refereed; + public void setBestaccessright(Qualifier bestaccessright) { + this.bestaccessright = bestaccessright; } public List getContext() { @@ -226,24 +218,6 @@ public abstract class Result extends OafEntity implements Serializable { this.instance = instance; } - public Field getProcessingchargeamount() { - return processingchargeamount; - } - - public Result setProcessingchargeamount(Field processingchargeamount) { - this.processingchargeamount = processingchargeamount; - return this; - } - - public Field getProcessingchargecurrency() { - return processingchargecurrency; - } - - public Result setProcessingchargecurrency(Field processingchargecurrency) { - this.processingchargecurrency = processingchargecurrency; - return this; - } - @Override public void mergeFrom(OafEntity e) { super.mergeFrom(e); @@ -287,19 +261,9 @@ public abstract class Result extends OafEntity implements Serializable { coverage = mergeLists(coverage, r.getCoverage()); - if (r.getRefereed() != null && compareTrust(this, r) < 
0) - refereed = r.getRefereed(); - context = mergeLists(context, r.getContext()); - if (r.getProcessingchargeamount() != null && compareTrust(this, r) < 0) - processingchargeamount = r.getProcessingchargeamount(); - - if (r.getProcessingchargecurrency() != null && compareTrust(this, r) < 0) - processingchargecurrency = r.getProcessingchargecurrency(); - externalReference = mergeLists(externalReference, r.getExternalReference()); - } @@ -314,5 +278,4 @@ public abstract class Result extends OafEntity implements Serializable { return a.size() > b.size() ? a : b; } - } From f9fae97e09ac7645c1c31a230a44286a9c4eda3e Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 13 Feb 2020 18:05:59 +0100 Subject: [PATCH 37/45] test json files aligned with the latest model changes --- .../test/resources/eu/dnetlib/dedup/json/authors_merge.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/json/authors_merge.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/json/authors_merge.json index 4e8b66d1b..090c94c26 100644 --- a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/json/authors_merge.json +++ b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/json/authors_merge.json @@ -1,3 +1,3 @@ -{"journal":{"name":"","issnPrinted":"","issnOnline":"","issnLinking":"","ep":"","iss":"","sp":"","vol":"","edition":"","conferenceplace":"","conferencedate":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"author":[{"fullname":"Nemkov, Pavel G.","name":"","surname":"","rank":1,"pid":[],"affiliation":[]},{"fullname":"Gayubo, Severiano 
F.","name":"","surname":"","rank":2,"pid":[{"value":"ORCID1","qualifier":{"classid":"orcid","classname":"orcid","schemeid":"dnet:pidType","schemename":"dnet:pidType"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"affiliation":[]},{"fullname":"Ciccio Pasticcio","name":"","surname":"","rank":2,"pid":[],"affiliation":[]}],"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"language":{"classid":"und","classname":"Undetermined","schemeid":"dent:languages","schemename":"dent:languages"},"country":[],"subject":[{"value":"Biodiversity","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Taxonomy","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Animalia","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Arthropoda","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Insecta","
qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Hymenoptera","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Crabronidae","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"title":[{"value":"A New Species Of Nysson Latreille (Hymenoptera: Crabronidae: Bembicinae) From Turkmenistan","qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"relevantdate":[{"value":"2003-12-31","qualifier":{"classid":"dnet:date","classname":"dnet:date","schemeid":"dnet:date","schemename":"dnet:date"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"description":[{"value":"Nemkov, Pavel G., Gayubo, Severiano F. (2003): A new species of Nysson Latreille (Hymenoptera: Crabronidae: Bembicinae) from Turkmenistan. 
Zootaxa 144: 1-4, DOI: 10.5281/zenodo.156314","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"Zenodo","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"refereed":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"context":[],"id":"50|scholexplore::ceb3a5d32107897a0df1178211e3e9ca","originalId":[],"collectedfrom":[{"key":"10|openaire____::e034d6a11054f5ade9221ebac484e864","value":"scholExplorer","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"pid":[{"value":"10.5281/zenodo.156314","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"extraInfo":[],"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":true,"inferenceprove
nance":"dedup-similarity-result-levenstein","provenanceaction":{"classid":"sysimport:actionset","classname":"sysimport:actionset","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0} -{"journal":{"name":"","issnPrinted":"","issnOnline":"","issnLinking":"","ep":"","iss":"","sp":"","vol":"","edition":"","conferenceplace":"","conferencedate":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"author":[{"fullname":"Nemkov, Pavel G.","name":"","surname":"","rank":1,"pid":[{"value":"ORCIDDIO","qualifier":{"classid":"ORCID","classname":"ORCID","schemeid":"dnet:pidType","schemename":"dnet:pidType"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"affiliation":[]},{"fullname":"Gayubo, Severiano 
F.","name":"","surname":"","rank":2,"pid":[{"value":"MAGGLES","qualifier":{"classid":"mag","classname":"mag","schemeid":"dnet:pidType","schemename":"dnet:pidType"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"affiliation":[]}],"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"language":{"classid":"und","classname":"Undetermined","schemeid":"dent:languages","schemename":"dent:languages"},"country":[],"subject":[{"value":"Biodiversity","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Taxonomy","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Animalia","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Arthropoda","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Insecta","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":
"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Hymenoptera","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Crabronidae","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"title":[{"value":"A New Species Of Nysson Latreille (Hymenoptera: Crabronidae: Bembicinae) From Turkmenistan","qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"relevantdate":[{"value":"2003-12-31","qualifier":{"classid":"dnet:date","classname":"dnet:date","schemeid":"dnet:date","schemename":"dnet:date"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"description":[{"value":"Nemkov, Pavel G., Gayubo, Severiano F. (2003): A new species of Nysson Latreille (Hymenoptera: Crabronidae: Bembicinae) from Turkmenistan. 
Zootaxa 144: 1-4, DOI: 10.5281/zenodo.156314","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"Zenodo","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"refereed":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"context":[],"id":"50|scholexplore::ceb3a5d32107897a0df1178211e3e9ca","originalId":[],"collectedfrom":[{"key":"10|openaire____::e034d6a11054f5ade9221ebac484e864","value":"scholExplorer","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"pid":[{"value":"10.5281/zenodo.156314","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"extraInfo":[],"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":true,"inferenceprove
nance":"dedup-similarity-result-levenstein","provenanceaction":{"classid":"sysimport:actionset","classname":"sysimport:actionset","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0} -{"journal":{"name":"","issnPrinted":"","issnOnline":"","issnLinking":"","ep":"","iss":"","sp":"","vol":"","edition":"","conferenceplace":"","conferencedate":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"author":[],"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"language":{"classid":"und","classname":"Undetermined","schemeid":"dent:languages","schemename":"dent:languages"},"country":[],"subject":[{"value":"Biodiversity","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Taxonomy","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Animalia","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Arthropoda","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false
,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Insecta","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Hymenoptera","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Crabronidae","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"title":[{"value":"A New Species Of Nysson Latreille (Hymenoptera: Crabronidae: Bembicinae) From Turkmenistan","qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"relevantdate":[{"value":"2003-12-31","qualifier":{"classid":"dnet:date","classname":"dnet:date","schemeid":"dnet:date","schemename":"dnet:date"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"description":[{"value":"Nemkov, Pavel G., Gayubo, Severiano F. (2003): A new species of Nysson Latreille (Hymenoptera: Crabronidae: Bembicinae) from Turkmenistan. 
Zootaxa 144: 1-4, DOI: 10.5281/zenodo.156314","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"Zenodo","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"refereed":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"context":[],"id":"50|scholexplore::ceb3a5d32107897a0df1178211e3e9ca","originalId":[],"collectedfrom":[{"key":"10|openaire____::e034d6a11054f5ade9221ebac484e864","value":"scholExplorer","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"pid":[{"value":"10.5281/zenodo.156314","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"extraInfo":[],"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":true,"inferenceprove
nance":"dedup-similarity-result-levenstein","provenanceaction":{"classid":"sysimport:actionset","classname":"sysimport:actionset","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0} +{"journal":{"name":"","issnPrinted":"","issnOnline":"","issnLinking":"","ep":"","iss":"","sp":"","vol":"","edition":"","conferenceplace":"","conferencedate":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"author":[{"fullname":"Nemkov, Pavel G.","name":"","surname":"","rank":1,"pid":[],"affiliation":[]},{"fullname":"Gayubo, Severiano F.","name":"","surname":"","rank":2,"pid":[{"value":"ORCID1","qualifier":{"classid":"orcid","classname":"orcid","schemeid":"dnet:pidType","schemename":"dnet:pidType"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"affiliation":[]},{"fullname":"Ciccio 
Pasticcio","name":"","surname":"","rank":2,"pid":[],"affiliation":[]}],"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"language":{"classid":"und","classname":"Undetermined","schemeid":"dent:languages","schemename":"dent:languages"},"country":[],"subject":[{"value":"Biodiversity","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Taxonomy","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Animalia","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Arthropoda","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Insecta","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Hymenoptera","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:su
bject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Crabronidae","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"title":[{"value":"A New Species Of Nysson Latreille (Hymenoptera: Crabronidae: Bembicinae) From Turkmenistan","qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"relevantdate":[{"value":"2003-12-31","qualifier":{"classid":"dnet:date","classname":"dnet:date","schemeid":"dnet:date","schemename":"dnet:date"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"description":[{"value":"Nemkov, Pavel G., Gayubo, Severiano F. (2003): A new species of Nysson Latreille (Hymenoptera: Crabronidae: Bembicinae) from Turkmenistan. 
Zootaxa 144: 1-4, DOI: 10.5281/zenodo.156314","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"Zenodo","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"context":[],"id":"50|scholexplore::ceb3a5d32107897a0df1178211e3e9ca","originalId":[],"collectedfrom":[{"key":"10|openaire____::e034d6a11054f5ade9221ebac484e864","value":"scholExplorer","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"pid":[{"value":"10.5281/zenodo.156314","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"extraInfo":[],"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":true,"inferenceprovenance":"dedup-similarity-result-levenstein","provenanceaction":{"classid":"sysimport:actionset","classname":"sysimport:actionset","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"
}},"lastupdatetimestamp":0} +{"journal":{"name":"","issnPrinted":"","issnOnline":"","issnLinking":"","ep":"","iss":"","sp":"","vol":"","edition":"","conferenceplace":"","conferencedate":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"author":[{"fullname":"Nemkov, Pavel G.","name":"","surname":"","rank":1,"pid":[{"value":"ORCIDDIO","qualifier":{"classid":"ORCID","classname":"ORCID","schemeid":"dnet:pidType","schemename":"dnet:pidType"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"affiliation":[]},{"fullname":"Gayubo, Severiano F.","name":"","surname":"","rank":2,"pid":[{"value":"MAGGLES","qualifier":{"classid":"mag","classname":"mag","schemeid":"dnet:pidType","schemename":"dnet:pidType"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"affiliation":[]}],"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"language":{"classid":"und","classname":"Undetermined","schemeid":"dent:languages","schemename":"dent:languages"},"country":[],"subject":[{"value":"Biodiversity","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Taxonomy","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenance
action":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Animalia","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Arthropoda","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Insecta","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Hymenoptera","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Crabronidae","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"title":[{"value":"A New Species Of Nysson Latreille (Hymenoptera: Crabronidae: Bembicinae) From Turkmenistan","qualifier":{"classid":"main title","classname":"main 
title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"relevantdate":[{"value":"2003-12-31","qualifier":{"classid":"dnet:date","classname":"dnet:date","schemeid":"dnet:date","schemename":"dnet:date"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"description":[{"value":"Nemkov, Pavel G., Gayubo, Severiano F. (2003): A new species of Nysson Latreille (Hymenoptera: Crabronidae: Bembicinae) from Turkmenistan. Zootaxa 144: 1-4, DOI: 10.5281/zenodo.156314","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"Zenodo","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"context":[],"id":"50|scholexplore::ceb3a5d32107897a0df1178211e3e9ca","originalId":[],"collectedfrom":[{"key":"10|openaire____::e034d6a11054f5ade9221ebac484e864","value":"scholExplorer","dataInfo":{"invisible":false,"inferred":false,"deletedbyinferenc
e":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"pid":[{"value":"10.5281/zenodo.156314","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"extraInfo":[],"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":true,"inferenceprovenance":"dedup-similarity-result-levenstein","provenanceaction":{"classid":"sysimport:actionset","classname":"sysimport:actionset","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0} +{"journal":{"name":"","issnPrinted":"","issnOnline":"","issnLinking":"","ep":"","iss":"","sp":"","vol":"","edition":"","conferenceplace":"","conferencedate":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"author":[],"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"language":{"classid":"und","classname":"Undetermined","schemeid":"dent:languages","schemename":"dent:languages"},"country":[],"subject":[{"value":"Biodiversity","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Taxonomy","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","sc
hemeid":"","schemename":""}}},{"value":"Animalia","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Arthropoda","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Insecta","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Hymenoptera","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Crabronidae","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"title":[{"value":"A New Species Of Nysson Latreille (Hymenoptera: Crabronidae: Bembicinae) From Turkmenistan","qualifier":{"classid":"main title","classname":"main 
title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"relevantdate":[{"value":"2003-12-31","qualifier":{"classid":"dnet:date","classname":"dnet:date","schemeid":"dnet:date","schemename":"dnet:date"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"description":[{"value":"Nemkov, Pavel G., Gayubo, Severiano F. (2003): A new species of Nysson Latreille (Hymenoptera: Crabronidae: Bembicinae) from Turkmenistan. Zootaxa 144: 1-4, DOI: 10.5281/zenodo.156314","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"Zenodo","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"context":[],"id":"50|scholexplore::ceb3a5d32107897a0df1178211e3e9ca","originalId":[],"collectedfrom":[{"key":"10|openaire____::e034d6a11054f5ade9221ebac484e864","value":"scholExplorer","dataInfo":{"invisible":false,"inferred":false,"deletedbyinferenc
e":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"pid":[{"value":"10.5281/zenodo.156314","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"extraInfo":[],"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":true,"inferenceprovenance":"dedup-similarity-result-levenstein","provenanceaction":{"classid":"sysimport:actionset","classname":"sysimport:actionset","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0} From 49e648f7c3f21d051a90a9c167b057c4711b68f4 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 13 Feb 2020 18:09:31 +0100 Subject: [PATCH 38/45] bumped version --- dhp-build/dhp-build-assembly-resources/pom.xml | 2 +- dhp-build/dhp-build-properties-maven-plugin/pom.xml | 2 +- dhp-build/pom.xml | 2 +- dhp-common/pom.xml | 2 +- dhp-schemas/pom.xml | 2 +- dhp-workflows/dhp-aggregation/pom.xml | 2 +- dhp-workflows/dhp-dedup/pom.xml | 5 ++--- dhp-workflows/dhp-distcp/pom.xml | 2 +- dhp-workflows/dhp-graph-mapper/pom.xml | 2 +- dhp-workflows/pom.xml | 2 +- pom.xml | 2 +- 11 files changed, 12 insertions(+), 13 deletions(-) diff --git a/dhp-build/dhp-build-assembly-resources/pom.xml b/dhp-build/dhp-build-assembly-resources/pom.xml index 834af77fa..60e55eeef 100644 --- a/dhp-build/dhp-build-assembly-resources/pom.xml +++ b/dhp-build/dhp-build-assembly-resources/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib.dhp dhp-build - 1.0.5-SNAPSHOT + 1.1.5-SNAPSHOT dhp-build-assembly-resources diff --git a/dhp-build/dhp-build-properties-maven-plugin/pom.xml b/dhp-build/dhp-build-properties-maven-plugin/pom.xml index 4f99d5298..ba56bc484 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/pom.xml +++ 
b/dhp-build/dhp-build-properties-maven-plugin/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib.dhp dhp-build - 1.0.5-SNAPSHOT + 1.1.5-SNAPSHOT dhp-build-properties-maven-plugin diff --git a/dhp-build/pom.xml b/dhp-build/pom.xml index e0b20204c..8d116498a 100644 --- a/dhp-build/pom.xml +++ b/dhp-build/pom.xml @@ -4,7 +4,7 @@ eu.dnetlib.dhp dhp - 1.0.5-SNAPSHOT + 1.1.5-SNAPSHOT dhp-build pom diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index 43c2a3834..7e160d082 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib.dhp dhp - 1.0.5-SNAPSHOT + 1.1.5-SNAPSHOT ../ diff --git a/dhp-schemas/pom.xml b/dhp-schemas/pom.xml index 20896a61d..9b5a6f17c 100644 --- a/dhp-schemas/pom.xml +++ b/dhp-schemas/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib.dhp dhp - 1.0.5-SNAPSHOT + 1.1.5-SNAPSHOT ../ diff --git a/dhp-workflows/dhp-aggregation/pom.xml b/dhp-workflows/dhp-aggregation/pom.xml index 328e783c4..6a5073f22 100644 --- a/dhp-workflows/dhp-aggregation/pom.xml +++ b/dhp-workflows/dhp-aggregation/pom.xml @@ -4,7 +4,7 @@ eu.dnetlib.dhp dhp-workflows - 1.0.5-SNAPSHOT + 1.1.5-SNAPSHOT dhp-aggregation diff --git a/dhp-workflows/dhp-dedup/pom.xml b/dhp-workflows/dhp-dedup/pom.xml index 28ef6a453..3d43fbec4 100644 --- a/dhp-workflows/dhp-dedup/pom.xml +++ b/dhp-workflows/dhp-dedup/pom.xml @@ -1,10 +1,9 @@ - + dhp-workflows eu.dnetlib.dhp - 1.0.5-SNAPSHOT + 1.1.5-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-distcp/pom.xml b/dhp-workflows/dhp-distcp/pom.xml index 3b2f3176b..d971a0747 100644 --- a/dhp-workflows/dhp-distcp/pom.xml +++ b/dhp-workflows/dhp-distcp/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.0.5-SNAPSHOT + 1.1.5-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-graph-mapper/pom.xml b/dhp-workflows/dhp-graph-mapper/pom.xml index 9186fa829..3ac1b7994 100644 --- a/dhp-workflows/dhp-graph-mapper/pom.xml +++ b/dhp-workflows/dhp-graph-mapper/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.0.5-SNAPSHOT + 1.1.5-SNAPSHOT 4.0.0 diff --git 
a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml index cf71190a4..f16362bff 100644 --- a/dhp-workflows/pom.xml +++ b/dhp-workflows/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib.dhp dhp - 1.0.5-SNAPSHOT + 1.1.5-SNAPSHOT ../ diff --git a/pom.xml b/pom.xml index a1db6894c..efef04eeb 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ 4.0.0 eu.dnetlib.dhp dhp - 1.0.5-SNAPSHOT + 1.1.5-SNAPSHOT pom http://www.d-net.research-infrastructures.eu From 6ed9a15bc8319f1d19453f49119bd19a29defb46 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 13 Feb 2020 18:11:31 +0100 Subject: [PATCH 39/45] [maven-release-plugin] prepare release dhp-1.1.5 --- dhp-build/dhp-build-assembly-resources/pom.xml | 2 +- dhp-build/dhp-build-properties-maven-plugin/pom.xml | 2 +- dhp-build/pom.xml | 2 +- dhp-common/pom.xml | 2 +- dhp-schemas/pom.xml | 2 +- dhp-workflows/dhp-aggregation/pom.xml | 2 +- dhp-workflows/dhp-dedup/pom.xml | 2 +- dhp-workflows/dhp-distcp/pom.xml | 2 +- dhp-workflows/dhp-graph-mapper/pom.xml | 2 +- dhp-workflows/pom.xml | 2 +- pom.xml | 4 ++-- 11 files changed, 12 insertions(+), 12 deletions(-) diff --git a/dhp-build/dhp-build-assembly-resources/pom.xml b/dhp-build/dhp-build-assembly-resources/pom.xml index 60e55eeef..a71ffaa30 100644 --- a/dhp-build/dhp-build-assembly-resources/pom.xml +++ b/dhp-build/dhp-build-assembly-resources/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib.dhp dhp-build - 1.1.5-SNAPSHOT + 1.1.5 dhp-build-assembly-resources diff --git a/dhp-build/dhp-build-properties-maven-plugin/pom.xml b/dhp-build/dhp-build-properties-maven-plugin/pom.xml index ba56bc484..76161d455 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/pom.xml +++ b/dhp-build/dhp-build-properties-maven-plugin/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib.dhp dhp-build - 1.1.5-SNAPSHOT + 1.1.5 dhp-build-properties-maven-plugin diff --git a/dhp-build/pom.xml b/dhp-build/pom.xml index 8d116498a..082a06333 100644 --- a/dhp-build/pom.xml +++ b/dhp-build/pom.xml @@ -4,7 +4,7 @@ eu.dnetlib.dhp dhp - 1.1.5-SNAPSHOT + 1.1.5 
dhp-build pom diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index 7e160d082..4c24a9607 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib.dhp dhp - 1.1.5-SNAPSHOT + 1.1.5 ../ diff --git a/dhp-schemas/pom.xml b/dhp-schemas/pom.xml index 9b5a6f17c..fd3a5bbb0 100644 --- a/dhp-schemas/pom.xml +++ b/dhp-schemas/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib.dhp dhp - 1.1.5-SNAPSHOT + 1.1.5 ../ diff --git a/dhp-workflows/dhp-aggregation/pom.xml b/dhp-workflows/dhp-aggregation/pom.xml index 6a5073f22..650a6a5a4 100644 --- a/dhp-workflows/dhp-aggregation/pom.xml +++ b/dhp-workflows/dhp-aggregation/pom.xml @@ -4,7 +4,7 @@ eu.dnetlib.dhp dhp-workflows - 1.1.5-SNAPSHOT + 1.1.5 dhp-aggregation diff --git a/dhp-workflows/dhp-dedup/pom.xml b/dhp-workflows/dhp-dedup/pom.xml index 3d43fbec4..96cc43e96 100644 --- a/dhp-workflows/dhp-dedup/pom.xml +++ b/dhp-workflows/dhp-dedup/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.1.5-SNAPSHOT + 1.1.5 4.0.0 diff --git a/dhp-workflows/dhp-distcp/pom.xml b/dhp-workflows/dhp-distcp/pom.xml index d971a0747..b470fbcac 100644 --- a/dhp-workflows/dhp-distcp/pom.xml +++ b/dhp-workflows/dhp-distcp/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.1.5-SNAPSHOT + 1.1.5 4.0.0 diff --git a/dhp-workflows/dhp-graph-mapper/pom.xml b/dhp-workflows/dhp-graph-mapper/pom.xml index 3ac1b7994..0b13f3521 100644 --- a/dhp-workflows/dhp-graph-mapper/pom.xml +++ b/dhp-workflows/dhp-graph-mapper/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.1.5-SNAPSHOT + 1.1.5 4.0.0 diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml index f16362bff..24555fdd0 100644 --- a/dhp-workflows/pom.xml +++ b/dhp-workflows/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib.dhp dhp - 1.1.5-SNAPSHOT + 1.1.5 ../ diff --git a/pom.xml b/pom.xml index efef04eeb..f7dfcbeda 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ 4.0.0 eu.dnetlib.dhp dhp - 1.1.5-SNAPSHOT + 1.1.5 pom http://www.d-net.research-infrastructures.eu @@ -38,7 +38,7 @@ 
scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git https://code-repo.d4science.org/D-Net/dnet-hadoop/ - HEAD + dhp-1.1.5 From a3d0b57b25943f54c06e28d9871ce0f7d785ce4a Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 13 Feb 2020 18:11:33 +0100 Subject: [PATCH 40/45] [maven-release-plugin] prepare for next development iteration --- dhp-build/dhp-build-assembly-resources/pom.xml | 2 +- dhp-build/dhp-build-properties-maven-plugin/pom.xml | 2 +- dhp-build/pom.xml | 2 +- dhp-common/pom.xml | 2 +- dhp-schemas/pom.xml | 2 +- dhp-workflows/dhp-aggregation/pom.xml | 2 +- dhp-workflows/dhp-dedup/pom.xml | 2 +- dhp-workflows/dhp-distcp/pom.xml | 2 +- dhp-workflows/dhp-graph-mapper/pom.xml | 2 +- dhp-workflows/pom.xml | 2 +- pom.xml | 4 ++-- 11 files changed, 12 insertions(+), 12 deletions(-) diff --git a/dhp-build/dhp-build-assembly-resources/pom.xml b/dhp-build/dhp-build-assembly-resources/pom.xml index a71ffaa30..0c4637def 100644 --- a/dhp-build/dhp-build-assembly-resources/pom.xml +++ b/dhp-build/dhp-build-assembly-resources/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib.dhp dhp-build - 1.1.5 + 1.1.6-SNAPSHOT dhp-build-assembly-resources diff --git a/dhp-build/dhp-build-properties-maven-plugin/pom.xml b/dhp-build/dhp-build-properties-maven-plugin/pom.xml index 76161d455..9a2d0ffa0 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/pom.xml +++ b/dhp-build/dhp-build-properties-maven-plugin/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib.dhp dhp-build - 1.1.5 + 1.1.6-SNAPSHOT dhp-build-properties-maven-plugin diff --git a/dhp-build/pom.xml b/dhp-build/pom.xml index 082a06333..e471af76d 100644 --- a/dhp-build/pom.xml +++ b/dhp-build/pom.xml @@ -4,7 +4,7 @@ eu.dnetlib.dhp dhp - 1.1.5 + 1.1.6-SNAPSHOT dhp-build pom diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index 4c24a9607..b736b22d8 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib.dhp dhp - 1.1.5 + 1.1.6-SNAPSHOT ../ diff --git 
a/dhp-schemas/pom.xml b/dhp-schemas/pom.xml index fd3a5bbb0..bea9489f9 100644 --- a/dhp-schemas/pom.xml +++ b/dhp-schemas/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib.dhp dhp - 1.1.5 + 1.1.6-SNAPSHOT ../ diff --git a/dhp-workflows/dhp-aggregation/pom.xml b/dhp-workflows/dhp-aggregation/pom.xml index 650a6a5a4..a1db4ad2e 100644 --- a/dhp-workflows/dhp-aggregation/pom.xml +++ b/dhp-workflows/dhp-aggregation/pom.xml @@ -4,7 +4,7 @@ eu.dnetlib.dhp dhp-workflows - 1.1.5 + 1.1.6-SNAPSHOT dhp-aggregation diff --git a/dhp-workflows/dhp-dedup/pom.xml b/dhp-workflows/dhp-dedup/pom.xml index 96cc43e96..6aef8f313 100644 --- a/dhp-workflows/dhp-dedup/pom.xml +++ b/dhp-workflows/dhp-dedup/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.1.5 + 1.1.6-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-distcp/pom.xml b/dhp-workflows/dhp-distcp/pom.xml index b470fbcac..c3f09b42c 100644 --- a/dhp-workflows/dhp-distcp/pom.xml +++ b/dhp-workflows/dhp-distcp/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.1.5 + 1.1.6-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-graph-mapper/pom.xml b/dhp-workflows/dhp-graph-mapper/pom.xml index 0b13f3521..802c3ff21 100644 --- a/dhp-workflows/dhp-graph-mapper/pom.xml +++ b/dhp-workflows/dhp-graph-mapper/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.1.5 + 1.1.6-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml index 24555fdd0..59f06bdc3 100644 --- a/dhp-workflows/pom.xml +++ b/dhp-workflows/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib.dhp dhp - 1.1.5 + 1.1.6-SNAPSHOT ../ diff --git a/pom.xml b/pom.xml index f7dfcbeda..300af6a61 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ 4.0.0 eu.dnetlib.dhp dhp - 1.1.5 + 1.1.6-SNAPSHOT pom http://www.d-net.research-infrastructures.eu @@ -38,7 +38,7 @@ scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git https://code-repo.d4science.org/D-Net/dnet-hadoop/ - dhp-1.1.5 + HEAD From 56d1810a66063b886a59e501790e064e6ac26750 Mon 
Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 14 Feb 2020 12:28:52 +0100 Subject: [PATCH 41/45] working procedure for records indexing using Spark, via lib com.lucidworks.spark:spark-solr --- .../eu/dnetlib/dhp/utils/saxon/PickFirst.java | 19 ++++++++++----- .../job-override.properties | 7 +++--- dhp-workflows/dhp-graph-provision/pom.xml | 7 +++++- .../dhp/graph/SparkXmlIndexingJob.java | 2 +- .../dhp/graph/utils/XmlRecordFactory.java | 24 +++++++++---------- .../dnetlib/dhp/graph/oozie_app/workflow.xml | 3 +-- dhp-workflows/pom.xml | 1 + pom.xml | 5 ++++ 8 files changed, 42 insertions(+), 26 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java index 1f209bed0..a221e37c6 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.utils.saxon; import net.sf.saxon.expr.XPathContext; +import net.sf.saxon.om.Item; import net.sf.saxon.om.Sequence; import net.sf.saxon.trans.XPathException; import net.sf.saxon.value.SequenceType; @@ -19,15 +20,21 @@ public class PickFirst extends AbstractExtensionFunction { if (arguments == null | arguments.length == 0) { return new StringValue(""); } - String s1 = arguments[0].head().getStringValue(); - if (arguments.length > 1) { - String s2 = arguments[1].head().getStringValue(); + final String s1 = getValue(arguments[0]); + final String s2 = getValue(arguments[1]); - return new StringValue(StringUtils.isNotBlank(s1) ? s1 : StringUtils.isNotBlank(s2) ? s2 : ""); - } else { - return new StringValue(StringUtils.isNotBlank(s1) ? s1 : ""); + return new StringValue(StringUtils.isNotBlank(s1) ? s1 : StringUtils.isNotBlank(s2) ? 
s2 : ""); + } + + private String getValue(final Sequence arg) throws XPathException { + if (arg != null) { + final Item item = arg.head(); + if (item != null) { + return item.getStringValue(); + } } + return ""; } @Override diff --git a/dhp-workflows/dhp-graph-provision/job-override.properties b/dhp-workflows/dhp-graph-provision/job-override.properties index c7b173a14..b5ab07982 100644 --- a/dhp-workflows/dhp-graph-provision/job-override.properties +++ b/dhp-workflows/dhp-graph-provision/job-override.properties @@ -5,7 +5,6 @@ isLookupUrl=http://beta.services.openaire.eu:8280/is/services/isLookUp?wsdl sourcePath=/tmp/db_openaireplus_services.export_dhp.2020.02.03 outputPath=/tmp/openaire_provision format=TMF -batchSize=1000 -sparkExecutorCoresForIndexing=1 -sparkExecutorInstances=10 -reuseRecords=false \ No newline at end of file +batchSize=2000 +sparkExecutorCoresForIndexing=64 +reuseRecords=true \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/pom.xml b/dhp-workflows/dhp-graph-provision/pom.xml index 5e6beb249..f74c9b666 100644 --- a/dhp-workflows/dhp-graph-provision/pom.xml +++ b/dhp-workflows/dhp-graph-provision/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.0.5-SNAPSHOT + 1.1.6-SNAPSHOT 4.0.0 @@ -52,6 +52,11 @@ org.apache.httpcomponents httpclient + + org.apache.httpcomponents + httpmime + + org.noggit noggit diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlIndexingJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlIndexingJob.java index e13f8bbe2..2775d93b4 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlIndexingJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlIndexingJob.java @@ -45,7 +45,7 @@ public class SparkXmlIndexingJob { final String inputPath = parser.get("sourcePath"); final String isLookupUrl = parser.get("isLookupUrl"); final String format = 
parser.get("format"); - final Integer batchSize = parser.getObjectMap().containsKey("batckSize") ? Integer.valueOf(parser.get("batchSize")) : DEFAULT_BATCH_SIZE; + final Integer batchSize = parser.getObjectMap().containsKey("batchSize") ? Integer.valueOf(parser.get("batchSize")) : DEFAULT_BATCH_SIZE; final ISLookUpService isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl); final String fields = getLayoutSource(isLookup, format); diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/XmlRecordFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/XmlRecordFactory.java index bd4f8ec6c..abcf2a7ec 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/XmlRecordFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/XmlRecordFactory.java @@ -129,6 +129,9 @@ public class XmlRecordFactory implements Serializable { .map(t -> mapStructuredProperty("title", t)) .collect(Collectors.toList())); } + if (r.getBestaccessright() != null) { + metadata.add(mapQualifier("bestaccessright", r.getBestaccessright())); + } if (r.getAuthor() != null) { metadata.addAll(r.getAuthor() .stream() @@ -230,15 +233,6 @@ public class XmlRecordFactory implements Serializable { if (r.getResourcetype() != null) { metadata.add(mapQualifier("resourcetype", r.getResourcetype())); } - if (r.getRefereed() != null) { - metadata.add(asXmlElement("refereed", r.getRefereed().getValue())); - } - if (r.getProcessingchargeamount() != null) { - metadata.add(asXmlElement("processingchargeamount", r.getProcessingchargeamount().getValue())); - } - if (r.getProcessingchargecurrency() != null) { - metadata.add(asXmlElement("processingchargecurrency", r.getProcessingchargecurrency().getValue())); - } metadata.add(mapQualifier("bestaccessright", getBestAccessright(r))); @@ -544,9 +538,6 @@ public class XmlRecordFactory implements Serializable { if (p.getDuration() != null) 
{ metadata.add(asXmlElement("duration", p.getDuration().getValue())); } - if (p.getEcsc39() != null) { - metadata.add(asXmlElement("ecsc39", p.getEcsc39().getValue())); - } if (p.getEcarticle29_3() != null) { metadata.add(asXmlElement("ecarticle29_3", p.getEcarticle29_3().getValue())); } @@ -759,6 +750,15 @@ public class XmlRecordFactory implements Serializable { if (isNotBlank(instance.getDistributionlocation())) { fields.add(asXmlElement("distributionlocation", instance.getDistributionlocation())); } + if (instance.getRefereed() != null && isNotBlank(instance.getRefereed().getValue())) { + fields.add(asXmlElement("refereed", instance.getRefereed().getValue())); + } + if (instance.getProcessingchargeamount() != null && isNotBlank(instance.getProcessingchargeamount().getValue())) { + fields.add(asXmlElement("processingchargeamount", instance.getProcessingchargeamount().getValue())); + } + if (instance.getProcessingchargecurrency() != null && isNotBlank(instance.getProcessingchargecurrency().getValue())) { + fields.add(asXmlElement("processingchargecurrency", instance.getProcessingchargecurrency().getValue())); + } children.add(templateFactory.getInstance(instance.getHostedby().getKey(), fields, instance.getUrl())); } diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml index fee463868..350358944 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml @@ -78,9 +78,8 @@ dhp-graph-provision-${projectVersion}.jar --executor-memory ${sparkExecutorMemory} - --executor-cores ${sparkExecutorCoresForIndexing} --driver-memory=${sparkDriverMemory} - --conf spark.executor.instances=${sparkExecutorInstances} + --conf 
spark.dynamicAllocation.maxExecutors=${sparkExecutorCoresForIndexing} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml index 59f06bdc3..05bfe677d 100644 --- a/dhp-workflows/pom.xml +++ b/dhp-workflows/pom.xml @@ -18,6 +18,7 @@ dhp-distcp dhp-graph-mapper dhp-dedup + dhp-graph-provision diff --git a/pom.xml b/pom.xml index 4e12ba1a9..8b01741d6 100644 --- a/pom.xml +++ b/pom.xml @@ -210,6 +210,11 @@ httpclient 4.5.3 + + org.apache.httpcomponents + httpmime + 4.5.3 + org.noggit noggit From c460e2d28126535ab085c104fbce3997eb781a2b Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 17 Feb 2020 11:54:48 +0100 Subject: [PATCH 42/45] Aggiornare 'dhp-workflows/docs/oozie-installer.markdown' --- dhp-workflows/docs/oozie-installer.markdown | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/dhp-workflows/docs/oozie-installer.markdown b/dhp-workflows/docs/oozie-installer.markdown index b1953a54e..d2de80dcc 100644 --- a/dhp-workflows/docs/oozie-installer.markdown +++ b/dhp-workflows/docs/oozie-installer.markdown @@ -10,9 +10,8 @@ This module is automatically executed when running: on module having set: - eu.dnetlib - dhp-wf - 1.0.0-SNAPSHOT + eu.dnetlib.dhp + dhp-workflows in `pom.xml` file. `oozie-package` profile initializes oozie workflow packaging, `workflow.source.dir` property points to a workflow (notice: this is not a relative path but a classpath to directory usually holding `oozie_app` subdirectory). 
From 76ee85141a10fead8a67c807416d6cb3b88a167d Mon Sep 17 00:00:00 2001 From: "sandro.labruzzo" Date: Mon, 17 Feb 2020 12:31:44 +0100 Subject: [PATCH 43/45] added oozie job for DNET migration and implemented Spark job for extracting entities --- .../migration/ExtractEntitiesFromHDFSJob.java | 56 ++++ ...extract_entities_from_hdfs_parameters.json | 26 ++ .../migration/oozie_app/config-default.xml | 22 ++ .../dhp/migration/oozie_app/workflow.xml | 282 ++++++++++++++++++ .../dnetlib/dhp/graph/GraphMappingUtils.java | 12 +- pom.xml | 2 +- 6 files changed, 393 insertions(+), 7 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/ExtractEntitiesFromHDFSJob.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/extract_entities_from_hdfs_parameters.json create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/workflow.xml diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/ExtractEntitiesFromHDFSJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/ExtractEntitiesFromHDFSJob.java new file mode 100644 index 000000000..f2d9caebf --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/ExtractEntitiesFromHDFSJob.java @@ -0,0 +1,56 @@ +package eu.dnetlib.dhp.migration; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.io.Text; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; +import scala.Tuple2; + +import java.util.Arrays; +import java.util.List; + +public class ExtractEntitiesFromHDFSJob { + + + private static List 
folderNames = Arrays.asList("db_entities", "oaf_entities", "odf_entities"); + + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils.toString(MigrateMongoMdstoresApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/extract_entities_from_hdfs_parameters.json"))); + parser.parseArgument(args); + + final SparkSession spark = SparkSession + .builder() + .appName(ExtractEntitiesFromHDFSJob.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); + + final String sourcePath = parser.get("sourcePath"); + final String targetPath = parser.get("graphRawPath"); + final String entity = parser.get("entity"); + + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + + JavaRDD inputRdd = sc.emptyRDD(); + + + folderNames.forEach(p -> inputRdd.union( + sc.sequenceFile(sourcePath+"/"+p, Text.class, Text.class) + .map(k -> new Tuple2<>(k._1().toString(), k._2().toString())) + .filter(k -> isEntityType(k._1(), entity)) + .map(Tuple2::_2)) + ); + + inputRdd.saveAsTextFile(targetPath+"/"+entity); + } + + + private static boolean isEntityType(final String item, final String entity) { + return StringUtils.substringAfter(item, ":").equalsIgnoreCase(entity); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/extract_entities_from_hdfs_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/extract_entities_from_hdfs_parameters.json new file mode 100644 index 000000000..f179ee0f8 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/extract_entities_from_hdfs_parameters.json @@ -0,0 +1,26 @@ +[ + { + "paramName": "s", + "paramLongName": "sourcePath", + "paramDescription": "the HDFS source path which contains the sequential file", + "paramRequired": true + }, + { + "paramName": "mt", + "paramLongName": "master", + "paramDescription": 
"should be local or yarn", + "paramRequired": true + }, + { + "paramName": "g", + "paramLongName": "graphRawPath", + "paramDescription": "the path of the graph Raw in hdfs", + "paramRequired": true + }, + { + "paramName": "e", + "paramLongName": "entity", + "paramDescription": "The entity to extract", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/config-default.xml new file mode 100644 index 000000000..51e48d8f7 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/config-default.xml @@ -0,0 +1,22 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hdfsUser + dnet + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/workflow.xml new file mode 100644 index 000000000..309a6d90f --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/workflow.xml @@ -0,0 +1,282 @@ + + + + workingPath + the base path to store hdfs file + + + graphRawPath + the graph Raw base path + + + + postgresURL + the postgres URL to access to the database + + + postgresUser + the user postgres + + + postgresPassword + the password postgres + + + mongourl + mongoDB url, example: mongodb://[username:password@]host[:port] + + + mongoDb + mongo database + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + 
+ + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.dhp.migration.MigrateDbEntitiesApplication + -p${workingPath}/db_entities + -n${nameNode} + -u${hdfsUser} + -dburl${postgresURL} + -dbuser${postgresUser} + -dbpasswd${postgresPassword} + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.dhp.migration.MigrateMongoMdstoresApplication + -p${workingPath}/odf_entities + -n${nameNode} + -u${hdfsUser} + -mongourl${mongourl} + -db${mongoDb} + -fODF + -lstore + -icleaned + -pgurl${postgresURL} + -pguser${postgresUser} + -pgpasswd${postgresPassword} + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.dhp.migration.MigrateMongoMdstoresApplication + -p${workingPath}/oaf_entities + -n${nameNode} + -u${hdfsUser} + -mongourl${mongourl} + -db${mongoDb} + -fOAF + -lstore + -icleaned + -pgurl${postgresURL} + -pguser${postgresUser} + -pgpasswd${postgresPassword} + + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + ExtractEntities: publication + eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob + dhp-aggregation-${projectVersion}.jar + --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" + -mt yarn-cluster + --sourcePath${workingPath} + -g${graphRawPath}/publication + -epublication + + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + ExtractEntities: dataset + eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob + dhp-aggregation-${projectVersion}.jar + --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" 
--conf spark.sql.warehouse.dir="/user/hive/warehouse" + -mt yarn-cluster + --sourcePath${workingPath} + -g${graphRawPath}/dataset + -edataset + + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + ExtractEntities: software + eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob + dhp-aggregation-${projectVersion}.jar + --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" + -mt yarn-cluster + --sourcePath${workingPath} + -g${graphRawPath}/software + -esoftware + + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + ExtractEntities: otherresearchproduct + eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob + dhp-aggregation-${projectVersion}.jar + --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" + -mt yarn-cluster + --sourcePath${workingPath} + -g${graphRawPath}/otherresearchproduct + -eotherresearchproduct + + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + ExtractEntities: datasource + eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob + dhp-aggregation-${projectVersion}.jar + --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" + -mt yarn-cluster + 
--sourcePath${workingPath} + -g${graphRawPath}/datasource + -edatasource + + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + ExtractEntities: organization + eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob + dhp-aggregation-${projectVersion}.jar + --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" + -mt yarn-cluster + --sourcePath${workingPath} + -g${graphRawPath}/organization + -eorganization + + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + ExtractEntities: project + eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob + dhp-aggregation-${projectVersion}.jar + --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" + -mt yarn-cluster + --sourcePath${workingPath} + -g${graphRawPath}/project + -eproject + + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + ExtractEntities: relation + eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob + dhp-aggregation-${projectVersion}.jar + --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" + -mt yarn-cluster + --sourcePath${workingPath} + -g${graphRawPath}/relation + -erelation + + + + + + + + \ No newline at end 
of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/GraphMappingUtils.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/GraphMappingUtils.java index 0291be47e..7c0967b2e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/GraphMappingUtils.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/GraphMappingUtils.java @@ -18,13 +18,13 @@ public class GraphMappingUtils { public final static Map types = Maps.newHashMap(); static { - types.put("datasource", Datasource.class); - types.put("organization", Organization.class); + types.put("datasource", Datasource.class); + types.put("organization", Organization.class); types.put("project", Project.class); - types.put("dataset", Dataset.class); - types.put("otherresearchproduct", OtherResearchProduct.class); - types.put("software", Software.class); - types.put("publication", Publication.class); + types.put("dataset", Dataset.class); + types.put("otherresearchproduct", OtherResearchProduct.class); + types.put("software", Software.class); + types.put("publication", Publication.class); types.put("relation", Relation.class); } diff --git a/pom.xml b/pom.xml index 6f85886c0..658d8285f 100644 --- a/pom.xml +++ b/pom.xml @@ -202,7 +202,7 @@ eu.dnetlib dnet-pace-core - 4.0.0-SNAPSHOT + 4.0.0 From 5bae30f399453d04bff8c48bd4a549748fcdda24 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 17 Feb 2020 13:38:33 +0100 Subject: [PATCH 44/45] adding readme for dhp-schema --- dhp-schemas/README.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/dhp-schemas/README.md b/dhp-schemas/README.md index 473ad4cf1..7431cda42 100644 --- a/dhp-schemas/README.md +++ b/dhp-schemas/README.md @@ -1,3 +1,11 @@ Description of the project -------------------------- -This project defines **serialization schemas** of Avro data store files that are used to pass data between workflow nodes in the system. 
+This project defines **object schemas** of the OpenAIRE main entities and the relationships that intercur among them. +Namely it defines the model for + +- **research product (result)** which subclasses in publication, dataset, other research product, software +- **data source** object describing the data provider (institutional repository, aggregators, cris systems) +- **organization** research bodies managing a data source or participating to a research project +- **project** research project + +Te serialization of such objects (data store files) are used to pass data between workflow nodes in the processing pipeline. From 6a288625e58eb7252ed7dbfd16aa0ae709168438 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 17 Feb 2020 15:04:33 +0100 Subject: [PATCH 45/45] fixed workflow outgoing node --- .../resources/eu/dnetlib/dhp/migration/oozie_app/workflow.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/workflow.xml index 309a6d90f..dd6998db0 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/workflow.xml @@ -113,7 +113,7 @@ -pguser${postgresUser} -pgpasswd${postgresPassword} - +