From 3adedd0a681e8a352fe96e850eed78ad227b210a Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Fri, 17 Jul 2020 11:58:11 +0200 Subject: [PATCH 01/17] trust truncated to 3 decimals --- .../dhp/oa/graph/raw/MigrateDbEntitiesApplication.java | 7 +++++-- .../graph/raw/datasourceorganization_resultset_entry.json | 4 ++-- .../dhp/oa/graph/raw/datasources_resultset_entry.json | 4 ++-- .../dhp/oa/graph/raw/organizations_resultset_entry.json | 4 ++-- .../oa/graph/raw/projectorganization_resultset_entry.json | 4 ++-- .../dnetlib/dhp/oa/graph/raw/projects_resultset_entry.json | 4 ++-- 6 files changed, 15 insertions(+), 12 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index da2ba4723..3a6ab25f0 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -496,9 +496,12 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i final Boolean deletedbyinference = rs.getBoolean("deletedbyinference"); final String inferenceprovenance = rs.getString("inferenceprovenance"); final Boolean inferred = rs.getBoolean("inferred"); - final String trust = rs.getString("trust"); + + final double trust = rs.getDouble("trust"); + return dataInfo( - deletedbyinference, inferenceprovenance, inferred, false, ENTITYREGISTRY_PROVENANCE_ACTION, trust); + deletedbyinference, inferenceprovenance, inferred, false, ENTITYREGISTRY_PROVENANCE_ACTION, + String.format("%.3f", trust)); } private Qualifier prepareQualifierSplitting(final String s) { diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/datasourceorganization_resultset_entry.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/datasourceorganization_resultset_entry.json index 2baf7c8f1..06b0d483b 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/datasourceorganization_resultset_entry.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/datasourceorganization_resultset_entry.json @@ -31,8 +31,8 @@ }, { "field": "trust", - "type": "string", - "value": "0.9" + "type": "double", + "value": 0.9 }, { "field": "inferenceprovenance", diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/datasources_resultset_entry.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/datasources_resultset_entry.json index 0f1da7095..23809bb85 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/datasources_resultset_entry.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/datasources_resultset_entry.json @@ -114,8 +114,8 @@ }, { "field": "trust", - "type": "string", - "value": "0.9" + "type": "double", + "value": 0.9 }, { "field": "inferenceprovenance", diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/organizations_resultset_entry.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/organizations_resultset_entry.json index 38657a1e1..811a9079f 100644 --- 
a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/organizations_resultset_entry.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/organizations_resultset_entry.json @@ -96,8 +96,8 @@ }, { "field": "trust", - "type": "string", - "value": "0.9" + "type": "double", + "value": 0.9 }, { "field": "inferenceprovenance", diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/projectorganization_resultset_entry.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/projectorganization_resultset_entry.json index 4311086e7..a3305926d 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/projectorganization_resultset_entry.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/projectorganization_resultset_entry.json @@ -41,8 +41,8 @@ }, { "field": "trust", - "type": "string", - "value": "0.9" + "type": "double", + "value": 0.9 }, { "field": "inferenceprovenance", diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/projects_resultset_entry.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/projects_resultset_entry.json index d6109cac1..37480fc94 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/projects_resultset_entry.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/projects_resultset_entry.json @@ -86,8 +86,8 @@ }, { "field": "trust", - "type": "string", - "value": "0.9" + "type": "double", + "value": 0.9 }, { "field": "inferenceprovenance", From cd27df91a1acc8d8c721bbff0954321b63723a85 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 6 Nov 2020 17:12:31 +0100 Subject: [PATCH 02/17] fixed bug on missing relation in ANDS --- .../eu/dnetlib/dhp/doiboost/QueryTest.scala | 9 ++- .../dhp/oa/graph/clean/CleaningFunctions.java | 28 ++++---- .../eu/dnetlib/dhp/sx/graph/IdReplace.scala | 3 + .../sx/graph/SparkSplitOafTODLIEntities.scala | 69 ++++++++++++++++++- .../dhp/sx/graph/ScholexplorerParserTest.java | 23 +++++++ .../resources/eu/dnetlib/dhp/sx/graph/dmf.xml | 67 ++++++++---------- .../resources/eu/dnetlib/dhp/sx/graph/pmf.xml | 25 +++++++ .../dhp/provision/scholix/Scholix.java | 26 ++++--- .../dhp/sx/index/oozie_app/config-default.xml | 4 ++ .../dhp/sx/index/oozie_app/workflow.xml | 47 ++++++++----- .../dhp/sx/provision/oozie_app/workflow.xml | 54 --------------- 11 files changed, 216 insertions(+), 139 deletions(-) create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/IdReplace.scala create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/pmf.xml diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/QueryTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/QueryTest.scala index b3402ee9f..61c1f5111 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/QueryTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/QueryTest.scala @@ -1,6 +1,6 @@ package eu.dnetlib.dhp.doiboost -import eu.dnetlib.dhp.schema.oaf.Publication +import eu.dnetlib.dhp.schema.oaf.{Publication, Relation} import org.apache.spark.SparkContext import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig} @@ -21,6 +21,13 @@ 
class QueryTest { + } + + + def has_ands(r:Relation) :Boolean = { + + r.getCollectedfrom!= null && r.getCollectedfrom.asScala.count(k => k.getValue.contains("Australian")) > 0 + } def hasInstanceWithUrl(p:Publication):Boolean = { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java index 12543d8c7..56a4aaf5a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java @@ -109,20 +109,20 @@ public class CleaningFunctions { } if (Objects.nonNull(r.getPid())) { r - .setPid( - r - .getPid() - .stream() - .filter(Objects::nonNull) - .filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue()))) - .filter(sp -> NONE.equalsIgnoreCase(sp.getValue())) - .filter(sp -> Objects.nonNull(sp.getQualifier())) - .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) - .map(sp -> { - sp.setValue(StringUtils.trim(sp.getValue())); - return sp; - }) - .collect(Collectors.toList())); + .setPid( + r + .getPid() + .stream() + .filter(Objects::nonNull) + .filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue()))) + .filter(sp -> NONE.equalsIgnoreCase(sp.getValue())) + .filter(sp -> Objects.nonNull(sp.getQualifier())) + .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) + .map(sp -> { + sp.setValue(StringUtils.trim(sp.getValue())); + return sp; + }) + .collect(Collectors.toList())); } if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) { r diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/IdReplace.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/IdReplace.scala new file mode 100644 index 000000000..8d375600c --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/IdReplace.scala @@ -0,0 +1,3 @@ +package eu.dnetlib.dhp.sx.graph + +case class IdReplace(newId:String, oldId:String) {} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkSplitOafTODLIEntities.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkSplitOafTODLIEntities.scala index 822b16263..f359f73f9 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkSplitOafTODLIEntities.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkSplitOafTODLIEntities.scala @@ -1,12 +1,15 @@ package eu.dnetlib.dhp.sx.graph import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation} +import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result} import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIUnknown} import eu.dnetlib.dhp.sx.ebi.EBIAggregator import org.apache.commons.io.IOUtils +import org.apache.commons.lang3.StringUtils import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.slf4j.LoggerFactory +import org.apache.spark.sql.functions.col + object SparkSplitOafTODLIEntities { @@ -83,14 +86,42 @@ object SparkSplitOafTODLIEntities { } + def extract_ids(o:Oaf) :(String, String) = { + + o match { + case p: DLIPublication => + val prefix = StringUtils.substringBefore(p.getId, "|") + val original = StringUtils.substringAfter(p.getOriginalObjIdentifier, 
"::") + (p.getId, s"$prefix|$original") + case p: DLIDataset => + val prefix = StringUtils.substringBefore(p.getId, "|") + val original = StringUtils.substringAfter(p.getOriginalObjIdentifier, "::") + (p.getId, s"$prefix|$original") + case _ =>null + } + } + def extract_relations(spark:SparkSession, workingPath:String) :Unit = { implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation] + import spark.implicits._ val OAFDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/input/OAFDataset").as[Oaf] val ebi_relation:Dataset[Relation] = spark.read.load(s"$workingPath/ebi/baseline_relation_ebi").as[Relation].repartition(2000) + + OAFDataset + .filter(o => o.isInstanceOf[Result]) + .map(extract_ids)(Encoders.tuple(Encoders.STRING, Encoders.STRING)) + .filter(r => r != null) + .where("_1 != _2") + .select(col("_1").alias("newId"), col("_2").alias("oldId")) + .distinct() + .map(f => IdReplace(f.getString(0), f.getString(1))) + .write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/id_replace") + + OAFDataset .filter(s => s != null && s.isInstanceOf[Relation]) .map(s =>s.asInstanceOf[Relation]) @@ -100,7 +131,41 @@ object SparkSplitOafTODLIEntities { .agg(EBIAggregator.getRelationAggregator().toColumn) .map(p => p._2) .repartition(4000) - .write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/relation") + .write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/relation_unfixed") + + + val relations = spark.read.load(s"$workingPath/graph/relation_unfixed").as[Relation] + val ids = spark.read.load(s"$workingPath/graph/id_replace").as[IdReplace] + + relations + .map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relEncoder)) + .joinWith(ids, col("_1").equalTo(ids("oldId")), "left") + .map(i =>{ + val r = i._1._2 + if (i._2 != null) + { + val id = i._2.newId + r.setSource(id) + } + r + }).write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/rel_f_source") + + val rel_source:Dataset[Relation] = spark.read.load(s"$workingPath/graph/rel_f_source").as[Relation] + + rel_source + .map(r => (r.getTarget, r))(Encoders.tuple(Encoders.STRING, relEncoder)) + .joinWith(ids, col("_1").equalTo(ids("oldId")), "left") + .map(i =>{ + val r:Relation = i._1._2 + if (i._2 != null) + { + val id = i._2.newId + r.setTarget(id) + } + r + }).write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/relation") + + } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/ScholexplorerParserTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/ScholexplorerParserTest.java index d418da594..67226a031 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/ScholexplorerParserTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/ScholexplorerParserTest.java @@ -12,6 +12,7 @@ import com.fasterxml.jackson.databind.SerializationFeature; import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.sx.graph.parser.DatasetScholexplorerParser; +import eu.dnetlib.dhp.sx.graph.parser.PublicationScholexplorerParser; import eu.dnetlib.scholexplorer.relation.RelationMapper; public class ScholexplorerParserTest { @@ -37,4 +38,26 @@ public class ScholexplorerParserTest { } }); } + + @Test + public void testPublicationParser() throws Exception { + String xml = IOUtils.toString(this.getClass().getResourceAsStream("pmf.xml")); + + PublicationScholexplorerParser p = new PublicationScholexplorerParser(); + List oaves = p.parseObject(xml, 
RelationMapper.load()); + + ObjectMapper m = new ObjectMapper(); + m.enable(SerializationFeature.INDENT_OUTPUT); + + oaves + .forEach( + oaf -> { + try { + System.out.println(m.writeValueAsString(oaf)); + System.out.println("----------------------------"); + } catch (JsonProcessingException e) { + + } + }); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/dmf.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/dmf.xml index 836e0b9a0..503d44bf7 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/dmf.xml +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/dmf.xml @@ -1,51 +1,38 @@ - - aaadf8b3-01a8-4cc2-9964-63cfb19df3b4_UmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZXMvUmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZVR5cGU= - oai:pangaea.de:doi:10.1594/PANGAEA.432865 - r3d100010134 - r3d100010134::00002f60593fd1f758fb838fafb46795 - 2020-02-18T03:05:02.534Z - - oai:pangaea.de:doi:10.1594/PANGAEA.432865 - citable topicOceans + xmlns="http://namespace.openaire.eu/"> + + r3d100010464::0002882a9d38c4f4612e7666ad768ccd + https://research.jcu.edu.au/researchdata/published/detail/9079e05370d830eb8d416c77c0b761ce::url + 2020-11-02T16:14:07.831Z + ands_UmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZXMvUmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZVR5cGU= + r3d100010464 - - - 10.1594/pangaea.432865 - - Daily sea level from coastal tide gauge station Woods_Hole in 1978 (Research quality database) + + + https://research.jcu.edu.au/researchdata/published/detail/9079e05370d830eb8d416c77c0b761ce + + Vertebrate monitoring in the Australian Wet Tropics rainforest at CU6A1 (145.30367623, -16.57767628, 600.0m above MSL) collected by Reptile Surveys - PANGAEA - Data Publisher for Earth & Environmental Science - 2006 - - 1978-01-01T12:00:00/1978-12-31T12:00:00 + James Cook University + + 2013-05-07 - - - WOCE Sea Level, WSL - - - - DATE/TIME - Sea level - Tide gauge station - SeaLevel - World Ocean Circulation Experiment (WOCE) - - - - http://store.pangaea.de/Projects/WOCE/SeaLevel_rqds/Woods_Hole.txt + + Dataset + + r3d100010464::57793c5aa995172db237d9da17353f8b - - + + - + complete collected diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/pmf.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/pmf.xml new file mode 100644 index 000000000..7b9abd158 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/pmf.xml @@ -0,0 +1,25 @@ + + + + r3d100010464::57793c5aa995172db237d9da17353f8b + 10.1111/j.1365-2486.2005.00995.x::doi + 2020-11-02T16:14:07.831Z + ands_UmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZXMvUmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZVR5cGU= + r3d100010464 + + + 10.1111/j.1365-2486.2005.00995.x + 10.1111/j.1365-2486.2005.00995.x + Potential decoupling of trends in distribution area and population size of species with climate change. 
+ publication + + + + + complete + collected + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/Scholix.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/Scholix.java index 6ea8ff735..ec3da5cfc 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/Scholix.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/Scholix.java @@ -97,12 +97,17 @@ public class Scholix implements Serializable { } private List mergeScholixEntityId(final List a, final List b) { - final List m = new ArrayList<>(a); + final List m = a != null ? new ArrayList<>(a) : new ArrayList<>(); if (b != null) b.forEach(s -> { - int tt = (int) m.stream().filter(t -> t.getName().equalsIgnoreCase(s.getName())).count(); - if (tt == 0) { - m.add(s); + if (s != null) { + int tt = (int) m + .stream() + .filter(t -> t != null && t.getName() != null && t.getName().equalsIgnoreCase(s.getName())) + .count(); + if (tt == 0) { + m.add(s); + } } }); return m; @@ -110,7 +115,7 @@ public class Scholix implements Serializable { private List mergeScholixIdnetifier(final List a, final List b) { - final List m = new ArrayList<>(a); + final List m = a != null ? new ArrayList<>(a) : new ArrayList<>(); if (b != null) b.forEach(s -> { int tt = (int) m.stream().filter(t -> t.getIdentifier().equalsIgnoreCase(s.getIdentifier())).count(); @@ -123,7 +128,7 @@ public class Scholix implements Serializable { private List mergeScholixCollectedFrom(final List a, final List b) { - final List m = new ArrayList<>(a); + final List m = a != null ? new ArrayList<>(a) : new ArrayList<>(); if (b != null) b.forEach(s -> { int tt = (int) m @@ -139,14 +144,15 @@ public class Scholix implements Serializable { private ScholixRelationship mergeRelationships(final ScholixRelationship a, final ScholixRelationship b) { ScholixRelationship result = new ScholixRelationship(); - result.setName(StringUtils.isEmpty(a.getName()) ? b.getName() : a.getName()); - result.setInverse(StringUtils.isEmpty(a.getInverse()) ? b.getInverse() : a.getInverse()); - result.setSchema(StringUtils.isEmpty(a.getSchema()) ? b.getSchema() : a.getSchema()); + result.setName(a == null || StringUtils.isEmpty(a.getName()) ? b.getName() : a.getName()); + result.setInverse(a == null || StringUtils.isEmpty(a.getInverse()) ? b.getInverse() : a.getInverse()); + result.setSchema(a == null || StringUtils.isEmpty(a.getSchema()) ? 
b.getSchema() : a.getSchema()); return result; } private ScholixResource mergeResource(final ScholixResource a, final ScholixResource b) { - + if (a == null) + return b; final ScholixResource result = new ScholixResource(); result.setCollectedFrom(mergeScholixCollectedFrom(a.getCollectedFrom(), b.getCollectedFrom())); result.setCreator(mergeScholixEntityId(a.getCreator(), b.getCreator())); diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/index/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/index/oozie_app/config-default.xml index 6fb2a1253..7c1a43e51 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/index/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/index/oozie_app/config-default.xml @@ -7,4 +7,8 @@ oozie.action.sharelib.for.spark spark2 + + oozie.launcher.mapreduce.user.classpath.first + true + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/index/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/index/oozie_app/workflow.xml index 4f5c7bbf6..d98164afb 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/index/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/index/oozie_app/workflow.xml @@ -1,9 +1,17 @@ - + workingDirPath the source path + + index + the index name + + + esCluster + the Index cluster + sparkDriverMemory memory for driver process @@ -12,39 +20,43 @@ sparkExecutorMemory memory for individual executor - - index - index name - - - - indexHost - index host name - - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.dhp.provision.DropAndCreateESIndex + -i${index} + -c${esCluster} + + + + + + ${jobTracker} ${nameNode} yarn-cluster cluster - index Summary + index summary eu.dnetlib.dhp.provision.SparkIndexCollectionOnES dhp-graph-provision-scholexplorer-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="32" + --executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="8" -mt yarn-cluster - --sourcePath${workingDirPath}/summary + --sourcePath${workingDirPath}/summary_json --index${index}_object - --esHost${indexHost} --idPathid - --typesummary + --cluster${esCluster} @@ -63,9 +75,8 @@ -mt yarn-cluster --sourcePath${workingDirPath}/scholix_json --index${index}_scholix - --esHost${indexHost} --idPathidentifier - --typescholix + --cluster${esCluster} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/provision/oozie_app/workflow.xml index c2c2a78fb..4c0d6c1da 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/provision/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/provision/oozie_app/workflow.xml @@ -112,59 +112,5 @@ - - - - ${jobTracker} - ${nameNode} - 
eu.dnetlib.dhp.provision.DropAndCreateESIndex - -i${index} - -c${esCluster} - - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - index summary - eu.dnetlib.dhp.provision.SparkIndexCollectionOnES - dhp-graph-provision-scholexplorer-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="8" - -mt yarn-cluster - --sourcePath${workingDirPath}/summary_json - --index${index}_object - --idPathid - --cluster${esCluster} - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - index scholix - eu.dnetlib.dhp.provision.SparkIndexCollectionOnES - dhp-graph-provision-scholexplorer-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="8" - -mt yarn-cluster - --sourcePath${workingDirPath}/scholix_json - --index${index}_scholix - --idPathidentifier - --cluster${esCluster} - - - - - \ No newline at end of file From 40160d171f736e38834dcfcd568c45a0c97056c7 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Mon, 9 Nov 2020 12:58:36 +0100 Subject: [PATCH 03/17] organizations pids --- .../raw/MigrateDbEntitiesApplication.java | 21 ++++++------ .../dhp/oa/graph/sql/queryOrganizations.sql | 32 +++++++++++++++---- 2 files changed, 36 insertions(+), 17 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index adf7b92be..6365a1db9 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -38,7 +38,11 @@ import java.io.IOException; import java.sql.Array; import java.sql.ResultSet; import java.sql.SQLException; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Date; +import java.util.List; +import java.util.Objects; import java.util.function.Consumer; import java.util.function.Function; import java.util.function.Predicate; @@ -174,7 +178,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i execute(sqlFile, producer, oaf -> true); } - public void execute(final String sqlFile, final Function> producer, + public void execute(final String sqlFile, + final Function> producer, final Predicate predicate) throws Exception { final String sql = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/sql/" + sqlFile)); @@ -198,8 +203,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i ds .setOriginalId( Arrays - .asList( - (String[]) rs.getArray("identities").getArray()) + .asList((String[]) rs.getArray("identities").getArray()) .stream() .filter(StringUtils::isNotBlank) .collect(Collectors.toList())); @@ -250,11 +254,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i ds .setJournal( journal( - rs.getString("officialname"), - rs.getString("issnPrinted"), - rs.getString("issnOnline"), - rs.getString("issnLinking"), - info)); // Journal + rs.getString("officialname"), rs.getString("issnPrinted"), rs.getString("issnOnline"), + rs.getString("issnLinking"), info)); // Journal ds.setDataInfo(info); ds.setLastupdatetimestamp(lastUpdateTimestamp); @@ -332,7 +333,7 
@@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i listKeyValues( createOpenaireId(10, rs.getString("collectedfromid"), true), rs.getString("collectedfromname"))); - o.setPid(new ArrayList<>()); + o.setPid(prepareListOfStructProps(rs.getArray("pid"), info)); o.setDateofcollection(asString(rs.getDate("dateofcollection"))); o.setDateoftransformation(asString(rs.getDate("dateoftransformation"))); o.setExtraInfo(new ArrayList<>()); // Values not present in the DB diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOrganizations.sql b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOrganizations.sql index 3e5de8071..938744b11 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOrganizations.sql +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOrganizations.sql @@ -24,12 +24,30 @@ SELECT d.officialname AS collectedfromname, o.country || '@@@dnet:countries' AS country, 'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction, - ARRAY[]::text[] AS pid - + array_remove(array_agg(DISTINCT i.pid || '###' || i.issuertype), NULL) AS pid FROM dsm_organizations o LEFT OUTER JOIN dsm_datasources d ON (d.id = o.collectedfrom) - - - - - + LEFT OUTER JOIN dsm_organizationpids p ON (p.organization = o.id) + LEFT OUTER JOIN dsm_identities i ON (i.pid = p.pid) +GROUP BY + o.id, + o.legalshortname, + o.legalname, + o.websiteurl, + o.logourl, + o.ec_legalbody, + o.ec_legalperson, + o.ec_nonprofit, + o.ec_researchorganization, + o.ec_highereducation, + o.ec_internationalorganizationeurinterests, + o.ec_internationalorganization, + o.ec_enterprise, + o.ec_smevalidated, + o.ec_nutscode, + o.dateofcollection, + o.lastupdate, + o.trust, + d.id, + d.officialname, + o.country From 18d9aad70cfce5c21e23c36c32a3275722e6103f Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 10 Nov 2020 11:48:55 +0100 Subject: [PATCH 04/17] improved documentation in dhp-graph-provision --- dhp-workflows/dhp-graph-provision/README.md | 21 +++++++++++++++++ .../oa/provision/AdjacencyListBuilderJob.java | 20 +--------------- .../CreateRelatedEntitiesJob_phase1.java | 23 +++---------------- .../CreateRelatedEntitiesJob_phase2.java | 20 +--------------- .../dhp/oa/provision/PrepareRelationsJob.java | 22 ++---------------- .../dhp/oa/provision/XmlConverterJob.java | 18 +-------------- 6 files changed, 29 insertions(+), 95 deletions(-) create mode 100644 dhp-workflows/dhp-graph-provision/README.md diff --git a/dhp-workflows/dhp-graph-provision/README.md b/dhp-workflows/dhp-graph-provision/README.md new file mode 100644 index 000000000..973a5909d --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/README.md @@ -0,0 +1,21 @@ +Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. The +operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, and +all the possible relationships (similarity links produced by the Dedup process are excluded). 
+ +The operation is implemented by sequentially joining one entity type at time (E) with the relationships (R), and +again by E, finally grouped by E.id; + +The workflow is organized in different parts aimed to to reduce the complexity of the operation + +1) PrepareRelationsJob: only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == +false), each entity can be linked at most to 100 other objects + +2) CreateRelatedEntitiesJob: (phase 1): prepare tuples [relation - target entity] (R - T): for each entity type +E_i map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information join (R.target = +T_i.id) save the tuples (R_i, T_i) (phase 2): create the union of all the entity types E, hash by id read the tuples +(R, T), hash by R.source join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) + +3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the +result as JoinedEntity + +4) XmlConverterJob: convert the JoinedEntities as XML records diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java index d9cc03cd5..0bc270e8f 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java @@ -25,26 +25,8 @@ import scala.collection.JavaConverters; import scala.collection.Seq; /** - * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. The - * operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, and - * all the possible relationships (similarity links produced by the Dedup process are excluded). - *
- * The operation is implemented by sequentially joining one entity type at time (E) with the relationships (R), and - * again by E, finally grouped by E.id; - *
- * The workflow is organized in different parts aimed to to reduce the complexity of the operation 1) - * PrepareRelationsJob: only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == - * false), each entity can be linked at most to 100 other objects - *
- * 2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - T): for each entity type - * E_i map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information join (R.target = - * T_i.id) save the tuples (R_i, T_i) (phase 2): create the union of all the entity types E, hash by id read the tuples - * (R, T), hash by R.source join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) - *
- * 3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the + * AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the * result as JoinedEntity - *
- * 4) XmlConverterJob: convert the JoinedEntities as XML records */ public class AdjacencyListBuilderJob { diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java index b08e593f7..86d380409 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java @@ -31,26 +31,9 @@ import eu.dnetlib.dhp.schema.oaf.*; import scala.Tuple2; /** - * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. The - * operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, and - * all the possible relationships (similarity links produced by the Dedup process are excluded). - *
- * The operation is implemented by sequentially joining one entity type at time (E) with the relationships (R), and - * again by E, finally grouped by E.id; - *
- * The workflow is organized in different parts aimed to to reduce the complexity of the operation 1) - * PrepareRelationsJob: only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == - * false), each entity can be linked at most to 100 other objects - *
- * 2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - T): for each entity type - * E_i map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information join (R.target = - * T_i.id) save the tuples (R_i, T_i) (phase 2): create the union of all the entity types E, hash by id read the tuples - * (R, T), hash by R.source join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) - *
- * 3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the - * result as JoinedEntity - *
- * 4) XmlConverterJob: convert the JoinedEntities as XML records + * CreateRelatedEntitiesJob: (phase 1): prepare tuples [relation - target entity] (R - T): for each entity type + * E_i map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information join + * (R.target = T_i.id) save the tuples (R_i, T_i) */ public class CreateRelatedEntitiesJob_phase1 { diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java index 7e175121e..170835fdb 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java @@ -34,26 +34,8 @@ import scala.collection.JavaConverters; import scala.collection.Seq; /** - * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. The - * operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, and - * all the possible relationships (similarity links produced by the Dedup process are excluded). - *
- * The operation is implemented by sequentially joining one entity type at time (E) with the relationships (R), and - * again by E, finally grouped by E.id; - *
- * The workflow is organized in different parts aimed to to reduce the complexity of the operation 1) - * PrepareRelationsJob: only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == - * false), each entity can be linked at most to 100 other objects - *
- * 2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - T): for each entity type - * E_i map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information join (R.target = - * T_i.id) save the tuples (R_i, T_i) (phase 2): create the union of all the entity types E, hash by id read the tuples + * CreateRelatedEntitiesJob (phase 2): create the union of all the entity types E, hash by id read the tuples * (R, T), hash by R.source join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) - *
- * 3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the - * result as JoinedEntity - *
- * 4) XmlConverterJob: convert the JoinedEntities as XML records */ public class CreateRelatedEntitiesJob_phase2 { diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java index da0a81021..20d27f0f3 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java @@ -36,26 +36,8 @@ import eu.dnetlib.dhp.schema.oaf.Relation; import scala.Tuple2; /** - * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. The - * operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, and - * all the possible relationships (similarity links produced by the Dedup process are excluded). - *
- * The operation is implemented by sequentially joining one entity type at time (E) with the relationships (R), and - * again by E, finally grouped by E.id; - *
- * The workflow is organized in different parts aimed to to reduce the complexity of the operation 1) - * PrepareRelationsJob: only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == - * false), each entity can be linked at most to 100 other objects - *
- * 2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - T): for each entity type - * E_i map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information join (R.target = - * T_i.id) save the tuples (R_i, T_i) (phase 2): create the union of all the entity types E, hash by id read the tuples - * (R, T), hash by R.source join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) - *
- * 3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the - * result as JoinedEntity - *
- * 4) XmlConverterJob: convert the JoinedEntities as XML records + * PrepareRelationsJob prunes the relationships: only consider relationships that are not virtually deleted + * ($.dataInfo.deletedbyinference == false), each entity can be linked at most to 100 other objects */ public class PrepareRelationsJob { diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java index a1ed7fd2a..d8eba31b6 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java @@ -37,23 +37,7 @@ import scala.collection.JavaConverters; import scala.collection.Seq; /** - * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. The - * operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, and - * all the possible relationships (similarity links produced by the Dedup process are excluded). - *
- * The workflow is organized in different parts aimed to to reduce the complexity of the operation 1) - * PrepareRelationsJob: only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == - * false), each entity can be linked at most to 100 other objects - *
- * 2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - T): for each entity type - * E_i map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information join (R.target = - * T_i.id) save the tuples (R_i, T_i) (phase 2): create the union of all the entity types E, hash by id read the tuples - * (R, T), hash by R.source join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) - *
- * 3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the - * result as JoinedEntity - *
- * 4) XmlConverterJob: convert the JoinedEntities as XML records + * XmlConverterJob converts the JoinedEntities as XML records */ public class XmlConverterJob { From e603fa5847168634596f10b0dc5856160714cd15 Mon Sep 17 00:00:00 2001 From: antleb Date: Tue, 10 Nov 2020 17:11:12 +0200 Subject: [PATCH 05/17] fixes in db creation --- dhp-workflows/dhp-stats-update/pom.xml | 2 +- .../graph/stats/oozie_app/scripts/step17.sql | 44 +++++++++++ .../graph/stats/oozie_app/scripts/step18.sql | 75 +------------------ .../graph/stats/oozie_app/scripts/step2.sql | 25 +++++-- .../graph/stats/oozie_app/scripts/step3.sql | 16 ++-- .../graph/stats/oozie_app/scripts/step4.sql | 16 ++-- .../graph/stats/oozie_app/scripts/step5.sql | 17 ++--- .../dhp/oa/graph/stats/oozie_app/workflow.xml | 27 ++++++- 8 files changed, 113 insertions(+), 109 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/pom.xml b/dhp-workflows/dhp-stats-update/pom.xml index 52f35ff07..d6ec4e6ab 100644 --- a/dhp-workflows/dhp-stats-update/pom.xml +++ b/dhp-workflows/dhp-stats-update/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.4-SNAPSHOT + 1.2.2-SNAPSHOT 4.0.0 dhp-stats-update diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step17.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step17.sql index e002f656e..5c102d014 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step17.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step17.sql @@ -5,8 +5,12 @@ ------------------------------------------------------ -- Dropping old views +DROP VIEW IF EXISTS ${stats_db_shadow_name}.category; +DROP VIEW IF EXISTS ${stats_db_shadow_name}.concept; +DROP VIEW IF EXISTS ${stats_db_shadow_name}.context; DROP VIEW IF EXISTS ${stats_db_shadow_name}.country; DROP VIEW IF EXISTS ${stats_db_shadow_name}.countrygdp; +DROP VIEW IF EXISTS ${stats_db_shadow_name}.creation_date; DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset; DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_citations; DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_classifications; @@ -16,6 +20,7 @@ DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_languages; DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_licenses; DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_oids; DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_pids; +DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_refereed; DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_sources; DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_topics; DROP VIEW IF EXISTS ${stats_db_shadow_name}.datasource; @@ -23,11 +28,15 @@ DROP VIEW IF EXISTS ${stats_db_shadow_name}.datasource_languages; DROP VIEW IF EXISTS ${stats_db_shadow_name}.datasource_oids; DROP VIEW IF EXISTS ${stats_db_shadow_name}.datasource_organizations; DROP VIEW IF EXISTS ${stats_db_shadow_name}.datasource_results; +DROP VIEW IF EXISTS ${stats_db_shadow_name}.datasource_sources; +DROP VIEW IF EXISTS ${stats_db_shadow_name}.funder; DROP VIEW IF EXISTS ${stats_db_shadow_name}.fundref; DROP VIEW IF EXISTS ${stats_db_shadow_name}.numbers_country; DROP VIEW IF EXISTS ${stats_db_shadow_name}.organization; DROP VIEW IF EXISTS ${stats_db_shadow_name}.organization_datasources; +DROP VIEW IF EXISTS ${stats_db_shadow_name}.organization_pids; DROP VIEW IF EXISTS ${stats_db_shadow_name}.organization_projects; +DROP VIEW IF 
EXISTS ${stats_db_shadow_name}.organization_sources; DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct; DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_citations; DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_classifications; @@ -37,12 +46,15 @@ DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_languages; DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_licenses; DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_oids; DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_pids; +DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_refereed; DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_sources; DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_topics; DROP VIEW IF EXISTS ${stats_db_shadow_name}.project; DROP VIEW IF EXISTS ${stats_db_shadow_name}.project_oids; DROP VIEW IF EXISTS ${stats_db_shadow_name}.project_organizations; DROP VIEW IF EXISTS ${stats_db_shadow_name}.project_results; +DROP VIEW IF EXISTS ${stats_db_shadow_name}.project_resultcount; +DROP VIEW IF EXISTS ${stats_db_shadow_name}.project_results_publication; DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication; DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_citations; DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_classifications; @@ -52,19 +64,28 @@ DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_languages; DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_licenses; DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_oids; DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_pids; +DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_refereed; DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_sources; DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_topics; DROP VIEW IF EXISTS ${stats_db_shadow_name}.result; +DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_affiliated_country; DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_citations; DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_classifications; DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_concepts; DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_datasources; +DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_deposited_country; +DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_fundercount; +DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_gold; +DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_greenoa; DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_languages; DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_licenses; DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_oids; DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_organization; +DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_peerreviewed; DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_pids; +DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_projectcount; DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_projects; +DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_refereed; DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_sources; DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_topics; DROP VIEW IF EXISTS ${stats_db_shadow_name}.rndexpediture; @@ -78,6 +99,7 @@ DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_languages; DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_licenses; DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_oids; DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_pids; +DROP VIEW IF EXISTS 
${stats_db_shadow_name}.software_refereed; DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_sources; DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_topics; @@ -86,8 +108,12 @@ DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_topics; CREATE database IF NOT EXISTS ${stats_db_shadow_name}; -- Creating new views +CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.category AS SELECT * FROM ${stats_db_name}.category; +CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.concept AS SELECT * FROM ${stats_db_name}.concept; +CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.context AS SELECT * FROM ${stats_db_name}.context; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.country AS SELECT * FROM ${stats_db_name}.country; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.countrygdp AS SELECT * FROM ${stats_db_name}.countrygdp; +CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.creation_date AS SELECT * FROM ${stats_db_name}.creation_date; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset AS SELECT * FROM ${stats_db_name}.dataset; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_citations AS SELECT * FROM ${stats_db_name}.dataset_citations; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_classifications AS SELECT * FROM ${stats_db_name}.dataset_classifications; @@ -97,6 +123,7 @@ CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_languages AS SELECT * CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_licenses AS SELECT * FROM ${stats_db_name}.dataset_licenses; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_oids AS SELECT * FROM ${stats_db_name}.dataset_oids; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_pids AS SELECT * FROM ${stats_db_name}.dataset_pids; +CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_refereed AS SELECT * FROM ${stats_db_name}.dataset_refereed; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_sources AS SELECT * FROM ${stats_db_name}.dataset_sources; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_topics AS SELECT * FROM ${stats_db_name}.dataset_topics; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.datasource AS SELECT * FROM ${stats_db_name}.datasource; @@ -104,11 +131,15 @@ CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.datasource_languages AS SELECT CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.datasource_oids AS SELECT * FROM ${stats_db_name}.datasource_oids; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.datasource_organizations AS SELECT * FROM ${stats_db_name}.datasource_organizations; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.datasource_results AS SELECT * FROM ${stats_db_name}.datasource_results; +CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.datasource_sources AS SELECT * FROM ${stats_db_name}.datasource_sources; +CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.funder AS SELECT * FROM ${stats_db_name}.funder; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.fundref AS SELECT * FROM ${stats_db_name}.fundref; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.numbers_country AS SELECT * FROM ${stats_db_name}.numbers_country; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.organization AS SELECT * FROM ${stats_db_name}.organization; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.organization_datasources AS SELECT * FROM ${stats_db_name}.organization_datasources; +CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.organization_pids AS SELECT * FROM ${stats_db_name}.organization_pids; CREATE VIEW IF NOT EXISTS 
${stats_db_shadow_name}.organization_projects AS SELECT * FROM ${stats_db_name}.organization_projects; +CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.organization_sources AS SELECT * FROM ${stats_db_name}.organization_sources; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct AS SELECT * FROM ${stats_db_name}.otherresearchproduct; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_citations AS SELECT * FROM ${stats_db_name}.otherresearchproduct_citations; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_classifications AS SELECT * FROM ${stats_db_name}.otherresearchproduct_classifications; @@ -118,12 +149,15 @@ CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_languages CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_licenses AS SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_oids AS SELECT * FROM ${stats_db_name}.otherresearchproduct_oids; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_pids AS SELECT * FROM ${stats_db_name}.otherresearchproduct_pids; +CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_refereed AS SELECT * FROM ${stats_db_name}.otherresearchproduct_refereed; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_sources AS SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_topics AS SELECT * FROM ${stats_db_name}.otherresearchproduct_topics; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.project AS SELECT * FROM ${stats_db_name}.project; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.project_oids AS SELECT * FROM ${stats_db_name}.project_oids; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.project_organizations AS SELECT * FROM ${stats_db_name}.project_organizations; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.project_results AS SELECT * FROM ${stats_db_name}.project_results; +CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.project_resultcount AS SELECT * FROM ${stats_db_name}.project_resultcount; +CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.project_results_publication AS SELECT * FROM ${stats_db_name}.project_results_publication; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication AS SELECT * FROM ${stats_db_name}.publication; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_citations AS SELECT * FROM ${stats_db_name}.publication_citations; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_classifications AS SELECT * FROM ${stats_db_name}.publication_classifications; @@ -133,19 +167,28 @@ CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_languages AS SELEC CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_licenses AS SELECT * FROM ${stats_db_name}.publication_licenses; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_oids AS SELECT * FROM ${stats_db_name}.publication_oids; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_pids AS SELECT * FROM ${stats_db_name}.publication_pids; +CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_refereed AS SELECT * FROM ${stats_db_name}.publication_refereed; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_sources AS SELECT * FROM ${stats_db_name}.publication_sources; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_topics AS SELECT * FROM 
${stats_db_name}.publication_topics; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result AS SELECT * FROM ${stats_db_name}.result; +CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_affiliated_country AS SELECT * FROM ${stats_db_name}.result_affiliated_country; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_citations AS SELECT * FROM ${stats_db_name}.result_citations; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_classifications AS SELECT * FROM ${stats_db_name}.result_classifications; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_concepts AS SELECT * FROM ${stats_db_name}.result_concepts; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_datasources AS SELECT * FROM ${stats_db_name}.result_datasources; +CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_deposited_country AS SELECT * FROM ${stats_db_name}.result_deposited_country; +CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_fundercount AS SELECT * FROM ${stats_db_name}.result_fundercount; +CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_gold AS SELECT * FROM ${stats_db_name}.result_gold; +CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_greenoa AS SELECT * FROM ${stats_db_name}.result_greenoa; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_languages AS SELECT * FROM ${stats_db_name}.result_languages; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_licenses AS SELECT * FROM ${stats_db_name}.result_licenses; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_oids AS SELECT * FROM ${stats_db_name}.result_oids; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_organization AS SELECT * FROM ${stats_db_name}.result_organization; +CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_peerreviewed AS SELECT * FROM ${stats_db_name}.result_peerreviewed; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_pids AS SELECT * FROM ${stats_db_name}.result_pids; +CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_projectcount AS SELECT * FROM ${stats_db_name}.result_projectcount; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_projects AS SELECT * FROM ${stats_db_name}.result_projects; +CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_refereed AS SELECT * FROM ${stats_db_name}.result_refereed; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_sources AS SELECT * FROM ${stats_db_name}.result_sources; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_topics AS SELECT * FROM ${stats_db_name}.result_topics; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.rndexpediture AS SELECT * FROM ${stats_db_name}.rndexpediture; @@ -159,5 +202,6 @@ CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_languages AS SELECT * CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_licenses AS SELECT * FROM ${stats_db_name}.software_licenses; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_oids AS SELECT * FROM ${stats_db_name}.software_oids; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_pids AS SELECT * FROM ${stats_db_name}.software_pids; +CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_refereed AS SELECT * FROM ${stats_db_name}.software_refereed; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_sources AS SELECT * FROM ${stats_db_name}.software_sources; CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_topics AS SELECT * FROM ${stats_db_name}.software_topics; diff --git 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step18.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step18.sql index 5645db309..34e48a18a 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step18.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step18.sql @@ -5,77 +5,4 @@ ------------------------------------------------------ ------------------------------------------------------ -COMPUTE STATS country; -COMPUTE STATS countrygdp; -COMPUTE STATS dataset; -COMPUTE STATS dataset_citations; -COMPUTE STATS dataset_classifications; -COMPUTE STATS dataset_concepts; -COMPUTE STATS dataset_datasources; -COMPUTE STATS dataset_languages; -COMPUTE STATS dataset_oids; -COMPUTE STATS dataset_pids; -COMPUTE STATS dataset_sources; -COMPUTE STATS dataset_topics; -COMPUTE STATS datasource; -COMPUTE STATS datasource_languages; -COMPUTE STATS datasource_oids; -COMPUTE STATS datasource_organizations; -COMPUTE STATS datasource_results; -COMPUTE STATS fundref; -COMPUTE STATS numbers_country; -COMPUTE STATS organization; -COMPUTE STATS organization_datasources; -COMPUTE STATS organization_projects; -COMPUTE STATS otherresearchproduct; -COMPUTE STATS otherresearchproduct_citations; -COMPUTE STATS otherresearchproduct_classifications; -COMPUTE STATS otherresearchproduct_concepts; -COMPUTE STATS otherresearchproduct_datasources; -COMPUTE STATS otherresearchproduct_languages; -COMPUTE STATS otherresearchproduct_licenses; -COMPUTE STATS otherresearchproduct_oids; -COMPUTE STATS otherresearchproduct_pids; -COMPUTE STATS otherresearchproduct_sources; -COMPUTE STATS otherresearchproduct_topics; -COMPUTE STATS project; -COMPUTE STATS project_oids; -COMPUTE STATS project_organizations; -COMPUTE STATS project_results; -COMPUTE STATS publication; -COMPUTE STATS publication_citations; -COMPUTE STATS publication_classifications; -COMPUTE STATS publication_concepts; -COMPUTE STATS publication_datasources; -COMPUTE STATS publication_languages; -COMPUTE STATS publication_licenses; -COMPUTE STATS publication_oids; -COMPUTE STATS publication_pids; -COMPUTE STATS publication_sources; -COMPUTE STATS publication_topics; -COMPUTE STATS result; -COMPUTE STATS result_citations; -COMPUTE STATS result_classifications; -COMPUTE STATS result_concepts; -COMPUTE STATS result_datasources; -COMPUTE STATS result_languages; -COMPUTE STATS result_licenses; -COMPUTE STATS result_oids; -COMPUTE STATS result_organization; -COMPUTE STATS result_pids; -COMPUTE STATS result_projects; -COMPUTE STATS result_sources; -COMPUTE STATS result_topics; -COMPUTE STATS rndexpediture; -COMPUTE STATS roarmap; -COMPUTE STATS software; -COMPUTE STATS software_citations; -COMPUTE STATS software_classifications; -COMPUTE STATS software_concepts; -COMPUTE STATS software_datasources; -COMPUTE STATS software_languages; -COMPUTE STATS software_licenses; -COMPUTE STATS software_oids; -COMPUTE STATS software_pids; -COMPUTE STATS software_sources; -COMPUTE STATS software_topics; +INVALIDATE METADATA ${stats_db_name}; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql index 312a8b82e..ba0db25be 100644 --- 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql @@ -17,19 +17,28 @@ case when size(p.description) > 0 then true else false end as abstract, from ${openaire_db_name}.publication p where p.datainfo.deletedbyinference=false; -CREATE TABLE ${stats_db_name}.publication_classifications AS SELECT substr(p.id, 4) as id, instancetype.classname as type from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.instancetype) instances as instancetype; +CREATE TABLE ${stats_db_name}.publication_classifications AS SELECT substr(p.id, 4) as id, instancetype.classname as type from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.instancetype) instances as instancetype where p.datainfo.deletedbyinference=false; -CREATE TABLE ${stats_db_name}.publication_concepts AS SELECT substr(p.id, 4) as id, contexts.context.id as concept from ${openaire_db_name}.publication p LATERAL VIEW explode(p.context) contexts as context; +CREATE TABLE ${stats_db_name}.publication_concepts AS SELECT substr(p.id, 4) as id, contexts.context.id as concept from ${openaire_db_name}.publication p LATERAL VIEW explode(p.context) contexts as context where p.datainfo.deletedbyinference=false; -CREATE TABLE ${stats_db_name}.publication_datasources as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM (SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource from ${openaire_db_name}.publication p lateral view explode(p.instance) instances as instance) p LEFT OUTER JOIN (SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id; +CREATE TABLE ${stats_db_name}.publication_datasources as +SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource + FROM ( + SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource + from ${openaire_db_name}.publication p lateral view explode(p.instance) instances as instance + where p.datainfo.deletedbyinference=false ) p + LEFT OUTER JOIN ( + SELECT substr(d.id, 4) id + from ${openaire_db_name}.datasource d + WHERE d.datainfo.deletedbyinference=false ) d on p.datasource = d.id; -CREATE TABLE ${stats_db_name}.publication_languages AS select substr(p.id, 4) as id, p.language.classname as language FROM ${openaire_db_name}.publication p; +CREATE TABLE ${stats_db_name}.publication_languages AS select substr(p.id, 4) as id, p.language.classname as language FROM ${openaire_db_name}.publication p where p.datainfo.deletedbyinference=false; -CREATE TABLE ${stats_db_name}.publication_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.originalid) oids AS ids; +CREATE TABLE ${stats_db_name}.publication_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference=false; -CREATE TABLE ${stats_db_name}.publication_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.pid) pids AS ppid; +CREATE TABLE ${stats_db_name}.publication_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid FROM ${openaire_db_name}.publication p 
LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference=false; -CREATE TABLE ${stats_db_name}.publication_topics as select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.subject) subjects AS subject; +CREATE TABLE ${stats_db_name}.publication_topics as select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.subject) subjects AS subject where p.datainfo.deletedbyinference=false; -- Publication_citations -CREATE TABLE ${stats_db_name}.publication_citations AS SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result FROM ${openaire_db_name}.publication p lateral view explode(p.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !=""; \ No newline at end of file +CREATE TABLE ${stats_db_name}.publication_citations AS SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result FROM ${openaire_db_name}.publication p lateral view explode(p.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="" and p.datainfo.deletedbyinference=false; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql index 47a102525..f69715a31 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql @@ -17,20 +17,20 @@ FROM ${openaire_db_name}.dataset d WHERE d.datainfo.deletedbyinference=FALSE; -- Dataset_citations -CREATE TABLE ${stats_db_name}.dataset_citations AS SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result FROM ${openaire_db_name}.dataset d LATERAL VIEW explode(d.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !=""; +CREATE TABLE ${stats_db_name}.dataset_citations AS SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result FROM ${openaire_db_name}.dataset d LATERAL VIEW explode(d.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="" and d.datainfo.deletedbyinference=false; -CREATE TABLE ${stats_db_name}.dataset_classifications AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype; +CREATE TABLE ${stats_db_name}.dataset_classifications AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype where p.datainfo.deletedbyinference=false; -CREATE TABLE ${stats_db_name}.dataset_concepts AS SELECT substr(p.id, 4) as id, contexts.context.id as concept from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.context) contexts as context; +CREATE TABLE ${stats_db_name}.dataset_concepts AS SELECT substr(p.id, 4) as id, 
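-- The recurring change across steps 2-5 is visible here: every statement that derives a stats table now
-- carries a datainfo.deletedbyinference=false predicate, and for the *_datasources tables the same
-- predicate is pushed into the sub-select that is joined against the datasource table. A minimal sketch
-- of the pattern, using a hypothetical example_languages table rather than one from the patch:
--   CREATE TABLE ${stats_db_name}.example_languages AS
--   SELECT substr(p.id, 4) AS id, p.language.classname AS language
--   FROM ${openaire_db_name}.publication p
--   WHERE p.datainfo.deletedbyinference = false;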
contexts.context.id as concept from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.context) contexts as context where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.dataset_datasources AS SELECT p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource FROM (SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) AS datasource -FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance) instances AS instance) p LEFT OUTER JOIN +FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance) instances AS instance where p.datainfo.deletedbyinference=false) p LEFT OUTER JOIN (SELECT substr(d.id, 4) id FROM ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d ON p.datasource = d.id; -CREATE TABLE ${stats_db_name}.dataset_languages AS SELECT substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.dataset p; +CREATE TABLE ${stats_db_name}.dataset_languages AS SELECT substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.dataset p where p.datainfo.deletedbyinference=false; -CREATE TABLE ${stats_db_name}.dataset_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.originalid) oids AS ids; +CREATE TABLE ${stats_db_name}.dataset_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference=false; -CREATE TABLE ${stats_db_name}.dataset_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.pid) pids AS ppid; +CREATE TABLE ${stats_db_name}.dataset_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference=false; -CREATE TABLE ${stats_db_name}.dataset_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.subject) subjects AS subject; +CREATE TABLE ${stats_db_name}.dataset_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.subject) subjects AS subject where p.datainfo.deletedbyinference=false; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql index ca1059cc8..2c4a625e1 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql @@ -17,20 +17,20 @@ from ${openaire_db_name}.software s where s.datainfo.deletedbyinference=false; -- Software_citations -CREATE TABLE ${stats_db_name}.software_citations AS SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS RESULT FROM ${openaire_db_name}.software s LATERAL VIEW explode(s.extrainfo) citations as citation where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !=""; +CREATE TABLE ${stats_db_name}.software_citations AS SELECT substr(s.id, 4) as id, 
xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS RESULT FROM ${openaire_db_name}.software s LATERAL VIEW explode(s.extrainfo) citations as citation where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="" and s.datainfo.deletedbyinference=false; -CREATE TABLE ${stats_db_name}.software_classifications AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype; +CREATE TABLE ${stats_db_name}.software_classifications AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype where p.datainfo.deletedbyinference=false; -CREATE TABLE ${stats_db_name}.software_concepts AS SELECT substr(p.id, 4) AS id, contexts.context.id AS concept FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.context) contexts AS context; +CREATE TABLE ${stats_db_name}.software_concepts AS SELECT substr(p.id, 4) AS id, contexts.context.id AS concept FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.context) contexts AS context where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.software_datasources AS SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource -FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.instance) instances AS instance) p LEFT OUTER JOIN +FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.instance) instances AS instance where p.datainfo.deletedbyinference=false) p LEFT OUTER JOIN (SELECT substr(d.id, 4) id FROM ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d ON p.datasource = d.id; -CREATE TABLE ${stats_db_name}.software_languages AS select substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.software p; +CREATE TABLE ${stats_db_name}.software_languages AS select substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.software p where p.datainfo.deletedbyinference=false; -CREATE TABLE ${stats_db_name}.software_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.originalid) oids AS ids; +CREATE TABLE ${stats_db_name}.software_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference=false; -CREATE TABLE ${stats_db_name}.software_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.pid) pids AS ppid; +CREATE TABLE ${stats_db_name}.software_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference=false; -CREATE TABLE ${stats_db_name}.software_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.subject) subjects AS subject; +CREATE TABLE ${stats_db_name}.software_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.subject) 
subjects AS subject where p.datainfo.deletedbyinference=false; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql index b4fb5aec6..1fa5df8cb 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql @@ -17,21 +17,20 @@ FROM ${openaire_db_name}.otherresearchproduct o WHERE o.datainfo.deletedbyinference=FALSE; -- Otherresearchproduct_citations -CREATE TABLE ${stats_db_name}.otherresearchproduct_citations AS SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS RESULT FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !=""; +CREATE TABLE ${stats_db_name}.otherresearchproduct_citations AS SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS RESULT FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="" and o.datainfo.deletedbyinference=false; -CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype; - -CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts AS SELECT substr(p.id, 4) AS id, contexts.context.id AS concept FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context; +CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype where p.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts AS SELECT substr(p.id, 4) AS id, contexts.context.id AS concept FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context where p.datainfo.deletedbyinference=false; CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources AS SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource -from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.instance) instances as instance) p LEFT OUTER JOIN +from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.instance) instances as instance where p.datainfo.deletedbyinference=false) p LEFT OUTER JOIN (SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id; -CREATE TABLE ${stats_db_name}.otherresearchproduct_languages AS SELECT substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.otherresearchproduct p; +CREATE TABLE ${stats_db_name}.otherresearchproduct_languages AS SELECT substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.otherresearchproduct p where 
p.datainfo.deletedbyinference=false; -CREATE TABLE ${stats_db_name}.otherresearchproduct_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.originalid) oids AS ids; +CREATE TABLE ${stats_db_name}.otherresearchproduct_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference=false; -CREATE TABLE ${stats_db_name}.otherresearchproduct_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid; +CREATE TABLE ${stats_db_name}.otherresearchproduct_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference=false; -CREATE TABLE ${stats_db_name}.otherresearchproduct_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject; +CREATE TABLE ${stats_db_name}.otherresearchproduct_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject where p.datainfo.deletedbyinference=false; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 174d78901..3101fe35a 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -237,6 +237,17 @@ stats_db_name=${stats_db_name} openaire_db_name=${openaire_db_name} + + + + + + + ${hive_jdbc_url} + + stats_db_name=${stats_db_name} + openaire_db_name=${openaire_db_name} + @@ -262,9 +273,23 @@ /user/${wf:user()}/oa/graph/stats/oozie_app/scripts/step18.sql impala-shell.sh - + + + + ${jobTracker} + ${nameNode} + impala-shell.sh + ${stats_db_shadow_name} + step19.sql + /user/${wf:user()}/oa/graph/stats/oozie_app/scripts/step19.sql + impala-shell.sh + + + + + From c02c7741c911b293fbb2c5ba131fb6087672253f Mon Sep 17 00:00:00 2001 From: antleb Date: Tue, 10 Nov 2020 17:11:30 +0200 Subject: [PATCH 06/17] fixes in db creation --- .../stats/oozie_app/scripts/step16_6.sql | 32 +++++++++++++++++++ .../graph/stats/oozie_app/scripts/step19.sql | 8 +++++ 2 files changed, 40 insertions(+) create mode 100644 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_6.sql create mode 100644 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step19.sql diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_6.sql new file mode 100644 index 000000000..ced7bbc11 --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_6.sql @@ -0,0 +1,32 @@ 
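-- The new step16_6.sql below pivots per-project result counts into a single row per project using
-- conditional sums over a grouped CTE. For illustration only (made-up figures): a project linked to
-- three publications and one dataset would come out of project_resultcount as
-- (pid, publications=3, datasets=1, software=0, other=0).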
+------------------------------------------- +--- Extra tables, mostly used by indicators + +create table ${stats_db_name}.result_projectcount as +select r.id, count(distinct p.id) as count +from ${stats_db_name}.result r +left outer join ${stats_db_name}.result_projects rp on rp.id=r.id +left outer join ${stats_db_name}.project p on p.id=rp.project +group by r.id; + +create table ${stats_db_name}.result_fundercount as +select r.id, count(distinct p.funder) as count +from ${stats_db_name}.result r +left outer join ${stats_db_name}.result_projects rp on rp.id=r.id +left outer join ${stats_db_name}.project p on p.id=rp.project +group by r.id; + +create table ${stats_db_name}.project_resultcount as +with rcount as ( + select p.id as pid, count(distinct r.id) as `count`, r.type as type + from ${stats_db_name}.project p + left outer join ${stats_db_name}.result_projects rp on rp.project=p.id + left outer join ${stats_db_name}.result r on r.id=rp.id + group by r.type, p.id ) +select rcount.pid, sum(case when rcount.type='publication' then rcount.count else 0 end) as publications, + sum(case when rcount.type='dataset' then rcount.count else 0 end) as datasets, + sum(case when rcount.type='software' then rcount.count else 0 end) as software, + sum(case when rcount.type='other' then rcount.count else 0 end) as other +from rcount +group by rcount.pid; + +create view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step19.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step19.sql new file mode 100644 index 000000000..34e48a18a --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step19.sql @@ -0,0 +1,8 @@ +------------------------------------------------------ +------------------------------------------------------ +-- Impala table statistics - Needed to make the tables +-- visible for impala +------------------------------------------------------ +------------------------------------------------------ + +INVALIDATE METADATA ${stats_db_name}; From f14e65f6a39c5e5abc0a828d3165b221f38ddaca Mon Sep 17 00:00:00 2001 From: antleb Date: Tue, 10 Nov 2020 17:23:04 +0200 Subject: [PATCH 07/17] reverted wrong change --- dhp-workflows/dhp-stats-update/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/pom.xml b/dhp-workflows/dhp-stats-update/pom.xml index d6ec4e6ab..52f35ff07 100644 --- a/dhp-workflows/dhp-stats-update/pom.xml +++ b/dhp-workflows/dhp-stats-update/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.2-SNAPSHOT + 1.2.4-SNAPSHOT 4.0.0 dhp-stats-update From 99ebaee3470c814a6e5dec4b3c6ef4bd87182e51 Mon Sep 17 00:00:00 2001 From: antleb Date: Wed, 11 Nov 2020 16:56:46 +0200 Subject: [PATCH 08/17] fixed #5913 --- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 3101fe35a..20eec37dc 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ 
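A note on the workflow.xml change below: a path like /user/${wf:user()}/oa/graph/stats/oozie_app/scripts/... resolves against the home directory of whichever user submits the job, whereas ${wf:appPath()} is the Oozie EL function that returns the HDFS path of the deployed workflow application, so the step18/step19 scripts are found regardless of the submitting user. A sketch of the corrected script reference, assuming it sits in an <argument> element of the impala-shell shell action (the element name is an assumption, not copied from the patch):
  <argument>${wf:appPath()}/scripts/step19.sql</argument>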
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -46,7 +46,7 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] @@ -270,7 +270,7 @@ impala-shell.sh ${stats_db_name} step18.sql - /user/${wf:user()}/oa/graph/stats/oozie_app/scripts/step18.sql + ${wf:appPath()}/scripts/step18.sql impala-shell.sh @@ -284,7 +284,7 @@ impala-shell.sh ${stats_db_shadow_name} step19.sql - /user/${wf:user()}/oa/graph/stats/oozie_app/scripts/step19.sql + ${wf:appPath()}/scripts/step19.sql impala-shell.sh From 822971f54f80cf40421ef0a00898738549492e76 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 12 Nov 2020 09:22:59 +0100 Subject: [PATCH 09/17] no need to filter relations in CreateRelatedEntitiesJob_phase1; replaced 'left outer' join with 'left' join in CreateRelatedEntitiesJob_phase2; cleanup; --- .../oa/provision/AdjacencyListBuilderJob.java | 91 ------------------- .../CreateRelatedEntitiesJob_phase1.java | 1 - .../CreateRelatedEntitiesJob_phase2.java | 4 +- .../dhp/oa/provision/PrepareRelationsJob.java | 13 ++- 4 files changed, 9 insertions(+), 100 deletions(-) delete mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java deleted file mode 100644 index 0bc270e8f..000000000 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java +++ /dev/null @@ -1,91 +0,0 @@ - -package eu.dnetlib.dhp.oa.provision; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - -import java.util.ArrayList; -import java.util.List; -import java.util.Optional; -import java.util.stream.Collectors; - -import org.apache.commons.io.IOUtils; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.api.java.function.MapGroupsFunction; -import org.apache.spark.sql.*; -import org.apache.spark.sql.expressions.Aggregator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.oa.provision.model.*; -import scala.Tuple2; -import scala.collection.JavaConverters; -import scala.collection.Seq; - -/** - * AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the - * result as JoinedEntity - */ -public class AdjacencyListBuilderJob { - - private static final Logger log = LoggerFactory.getLogger(AdjacencyListBuilderJob.class); - - public static final int MAX_LINKS = 100; - - public static void main(String[] args) throws Exception { - - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - AdjacencyListBuilderJob.class - .getResourceAsStream( - "/eu/dnetlib/dhp/oa/provision/input_params_build_adjacency_lists.json"))); - parser.parseArgument(args); - - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - - String inputPath = parser.get("inputPath"); - log.info("inputPath: {}", inputPath); - - String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", 
outputPath); - - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(ProvisionModelSupport.getModelClasses()); - - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - removeOutputDir(spark, outputPath); - createAdjacencyListsKryo(spark, inputPath, outputPath); - }); - } - - private static void createAdjacencyListsKryo( - SparkSession spark, String inputPath, String outputPath) { - - log.info("Reading joined entities from: {}", inputPath); - - final List paths = HdfsSupport - .listFiles(inputPath, spark.sparkContext().hadoopConfiguration()); - - log.info("Found paths: {}", String.join(",", paths)); - - } - - private static Seq toSeq(List list) { - return JavaConverters.asScalaIteratorConverter(list.iterator()).asScala().toSeq(); - } - - private static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } -} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java index 86d380409..dd251ec04 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java @@ -92,7 +92,6 @@ public class CreateRelatedEntitiesJob_phase1 { String outputPath) { Dataset> relsByTarget = readPathRelation(spark, inputRelationsPath) - .filter("dataInfo.deletedbyinference == false") .map( (MapFunction>) r -> new Tuple2<>(r.getTarget(), r), diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java index 170835fdb..9cdf1cd2e 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java @@ -105,7 +105,7 @@ public class CreateRelatedEntitiesJob_phase2 { TypedColumn aggregator = new AdjacencyListAggregator().toColumn(); entities - .joinWith(relatedEntities, entities.col("_1").equalTo(relatedEntities.col("_1")), "left_outer") + .joinWith(relatedEntities, entities.col("_1").equalTo(relatedEntities.col("_1")), "left") .map((MapFunction, Tuple2>, JoinedEntity>) value -> { JoinedEntity je = new JoinedEntity(value._1()._2()); Optional @@ -114,7 +114,6 @@ public class CreateRelatedEntitiesJob_phase2 { .ifPresent(r -> je.getLinks().add(r)); return je; }, Encoders.kryo(JoinedEntity.class)) - .filter(filterEmptyEntityFn()) .groupByKey( (MapFunction) value -> value.getEntity().getId(), Encoders.STRING()) @@ -122,7 +121,6 @@ public class CreateRelatedEntitiesJob_phase2 { .map( (MapFunction, JoinedEntity>) value -> value._2(), Encoders.kryo(JoinedEntity.class)) - .filter(filterEmptyEntityFn()) .write() .mode(SaveMode.Overwrite) .parquet(outputPath); diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java index 20d27f0f3..c87f0cd94 100644 --- 
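On the join type change in CreateRelatedEntitiesJob_phase2 above: in Spark SQL the strings "left", "leftouter" and "left_outer" all resolve to the same left outer join, so that rename should not change semantics; the behavioural part of this commit is dropping the deletedbyinference filter on relations in phase1 and the filterEmptyEntityFn() steps in phase2. A minimal sketch of the equivalence, with hypothetical Dataset<Row> inputs a and b that are not part of the patch:
  Dataset<Tuple2<Row, Row>> j1 = a.joinWith(b, a.col("id").equalTo(b.col("id")), "left");
  Dataset<Tuple2<Row, Row>> j2 = a.joinWith(b, a.col("id").equalTo(b.col("id")), "left_outer");
  // j1 and j2 contain the same rows: both strings map to a left outer join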
a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java @@ -3,8 +3,10 @@ package eu.dnetlib.dhp.oa.provision; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import java.util.*; -import java.util.function.Supplier; +import java.util.HashSet; +import java.util.Optional; +import java.util.PriorityQueue; +import java.util.Set; import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; @@ -15,8 +17,10 @@ import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.rdd.RDD; -import org.apache.spark.sql.*; +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.expressions.Aggregator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -24,7 +28,6 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.base.Splitter; import com.google.common.collect.Iterables; -import com.google.common.collect.Maps; import com.google.common.collect.Sets; import eu.dnetlib.dhp.application.ArgumentApplicationParser; From 13e36a4da0297cc6186e9e014cdec7c0e59a284d Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 13 Nov 2020 10:05:02 +0100 Subject: [PATCH 10/17] WIP: added oozie workflow for grouping graph entities by id --- .../dhp/schema/oaf/OafMapperUtils.java | 2 + .../dhp/schema/oaf/ResultTypeComparator.java | 2 + .../groupbyid/DispatchEntitiesSparkJob.java | 87 ++++++ .../GroupEntitiesAndRelationsSparkJob.java | 186 ++++++++++++ ...kJob.java => MergeGraphTableSparkJob.java} | 4 +- .../raw/GenerateEntitiesApplication.java | 31 +- .../oa/graph/raw/common/OafMapperUtils.java | 273 ------------------ .../dispatch_entities_bytype_parameters.json | 20 ++ .../group_graph_entities_parameters.json | 20 ++ .../groupbyid/oozie_app/config-default.xml | 18 ++ .../oa/graph/groupbyid/oozie_app/workflow.xml | 271 +++++++++++++++++ .../dhp/oa/graph/merge/oozie_app/workflow.xml | 16 +- ....java => MergeGraphTableSparkJobTest.java} | 14 +- .../raw/GenerateEntitiesApplicationTest.java | 99 +++++++ 14 files changed, 727 insertions(+), 316 deletions(-) create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/OafMapperUtils.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/ResultTypeComparator.java create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/groupbyid/DispatchEntitiesSparkJob.java create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/groupbyid/GroupEntitiesAndRelationsSparkJob.java rename dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/{MergeGraphSparkJob.java => MergeGraphTableSparkJob.java} (98%) delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dispatch_entities_bytype_parameters.json create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/group_graph_entities_parameters.json create mode 100644 
dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/groupbyid/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/groupbyid/oozie_app/workflow.xml rename dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/merge/{MergeGraphSparkJobTest.java => MergeGraphTableSparkJobTest.java} (90%) create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplicationTest.java diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/OafMapperUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/OafMapperUtils.java new file mode 100644 index 000000000..6ba918069 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/OafMapperUtils.java @@ -0,0 +1,2 @@ +package eu.dnetlib.dhp.schema.oaf;public class OafMapperUtils { +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/ResultTypeComparator.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/ResultTypeComparator.java new file mode 100644 index 000000000..a2e64b83b --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/ResultTypeComparator.java @@ -0,0 +1,2 @@ +package eu.dnetlib.dhp.schema.oaf;public class ResultTypeComparator { +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/groupbyid/DispatchEntitiesSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/groupbyid/DispatchEntitiesSparkJob.java new file mode 100644 index 000000000..6b90e9971 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/groupbyid/DispatchEntitiesSparkJob.java @@ -0,0 +1,87 @@ + +package eu.dnetlib.dhp.oa.graph.fuse; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Optional; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +public class DispatchEntitiesSparkJob { + + private static final Logger log = LoggerFactory.getLogger(DispatchEntitiesSparkJob.class); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + MigrateMongoMdstoresApplication.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/dispatch_entities_bytype_parameters.json"))); + parser.parseArgument(args); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String sourcePath = parser.get("sourcePath"); + final String targetPath = parser.get("graphRawPath"); + + SparkConf conf = new SparkConf(); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, targetPath); + ModelSupport.oafTypes + .values() + 
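                // Note on the input handled here: each line appears to be of the form "<entity type>|<json>",
                // e.g. "publication|{...}", as the substringBefore/substringAfter calls on "|" in
                // processEntity below suggest; the type prefix selects the per-type output directory and is
                // stripped before the JSON is written out.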
.forEach(clazz -> processEntity(spark, clazz, sourcePath, targetPath)); + }); + } + + private static void processEntity( + final SparkSession spark, + final Class clazz, + final String sourcePath, + final String targetPath) { + final String type = clazz.getSimpleName().toLowerCase(); + + log.info("Processing entities ({}) in file: {}", type, sourcePath); + + spark + .read() + .textFile(sourcePath) + .filter((FilterFunction) value -> isEntityType(value, type)) + .map( + (MapFunction) l -> StringUtils.substringAfter(l, "|"), + Encoders.STRING()) + .write() + .option("compression", "gzip") + .mode(SaveMode.Overwrite) + .text(targetPath + "/" + type); + } + + private static boolean isEntityType(final String line, final String type) { + return StringUtils.substringBefore(line, "|").equalsIgnoreCase(type); + } + + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/groupbyid/GroupEntitiesAndRelationsSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/groupbyid/GroupEntitiesAndRelationsSparkJob.java new file mode 100644 index 000000000..7ab8646ec --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/groupbyid/GroupEntitiesAndRelationsSparkJob.java @@ -0,0 +1,186 @@ + +package eu.dnetlib.dhp.oa.graph.groupbyid; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static eu.dnetlib.dhp.utils.DHPUtils.toSeq; + +import java.io.IOException; +import java.util.List; +import java.util.Objects; +import java.util.Optional; +import java.util.stream.Collectors; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.*; +import org.apache.spark.sql.expressions.Aggregator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.jayway.jsonpath.JsonPath; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; +import scala.Tuple2; + +/** + * Groups the graph content by entity identifier to ensure ID uniqueness + */ +public class GroupEntitiesSparkJob { + + private static final Logger log = LoggerFactory.getLogger(GroupEntitiesSparkJob.class); + + private final static String ID_JPATH = "$.id"; + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(String[] args) throws Exception { + + String jsonConfiguration = IOUtils + .toString( + GroupEntitiesSparkJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/group_graph_entities_parameters.json")); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String graphInputPath = parser.get("graphInputPath"); + log.info("graphInputPath: {}", graphInputPath); + + String outputPath = parser.get("outputPath"); + 
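        // Outline of what this class does (the relevant code follows below): every entity path under
        // graphInputPath is read as text (listEntityPaths skips anything containing "relation"), each JSON
        // line is materialised by parseEntity as the concrete Oaf subclass selected by the id prefix
        // (10 datasource, 20 organization, 40 project, 50 result, the latter split further on
        // resulttype.classid), records sharing the same identifier are folded together by GroupingAggregator
        // through OafMapperUtils.merge, and the merged records are written to outputPath.
        // Illustrative id only: something like "50|fakeNs______::<md5>" follows the
        // "<prefix>|<nsPrefix>::<md5>" shape produced by createOpenaireId and would be parsed as a result;
        // the namespace prefix here is a made-up example.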
log.info("outputPath: {}", outputPath); + + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ModelSupport.getOafModelClasses()); + + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration()); + groupEntities(spark, graphInputPath, outputPath); + }); + } + + private static void groupEntities( + SparkSession spark, + String inputPath, + String outputPath) { + + TypedColumn aggregator = new GroupingAggregator().toColumn(); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + spark + .read() + .textFile(toSeq(listEntityPaths(inputPath, sc))) + .map((MapFunction) s -> parseEntity(s), Encoders.kryo(Oaf.class)) + .groupByKey((MapFunction) oaf -> ModelSupport.idFn().apply(oaf), Encoders.STRING()) + .agg(aggregator) + .map((MapFunction, Oaf>) Tuple2::_2, Encoders.kryo(Oaf.class)) + .write() + .mode(SaveMode.Overwrite) + .save(outputPath); + } + + public static class GroupingAggregator extends Aggregator { + + @Override + public Oaf zero() { + return null; + } + + @Override + public Oaf reduce(Oaf b, Oaf a) { + return mergeAndGet(b, a); + } + + private Oaf mergeAndGet(Oaf b, Oaf a) { + if (Objects.nonNull(a) && Objects.nonNull(b)) { + return OafMapperUtils.merge(b, a); + } + return Objects.isNull(a) ? b : a; + } + + @Override + public Oaf merge(Oaf b, Oaf a) { + return mergeAndGet(b, a); + } + + @Override + public Oaf finish(Oaf j) { + return j; + } + + @Override + public Encoder bufferEncoder() { + return Encoders.kryo(Oaf.class); + } + + @Override + public Encoder outputEncoder() { + return Encoders.kryo(Oaf.class); + } + + } + + private static Oaf parseEntity(String s) { + String prefix = StringUtils.substringBefore(jPath(ID_JPATH, s), "|"); + try { + switch (prefix) { + case "10": + return OBJECT_MAPPER.readValue(s, Datasource.class); + case "20": + return OBJECT_MAPPER.readValue(s, Organization.class); + case "40": + return OBJECT_MAPPER.readValue(s, Project.class); + case "50": + String resultType = jPath("$.resulttype.classid", s); + switch (resultType) { + case "publication": + return OBJECT_MAPPER.readValue(s, Publication.class); + case "dataset": + return OBJECT_MAPPER.readValue(s, eu.dnetlib.dhp.schema.oaf.Dataset.class); + case "software": + return OBJECT_MAPPER.readValue(s, Software.class); + case "other": + return OBJECT_MAPPER.readValue(s, OtherResearchProduct.class); + default: + throw new IllegalArgumentException(String.format("invalid resultType: '%s'", resultType)); + } + default: + throw new IllegalArgumentException(String.format("invalid id prefix: '%s'", prefix)); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private static List listEntityPaths(String inputPath, JavaSparkContext sc) { + return HdfsSupport + .listFiles(inputPath, sc.hadoopConfiguration()) + .stream() + .filter(p -> !p.contains("relation")) + .collect(Collectors.toList()); + } + + private static String jPath(final String path, final String json) { + Object o = JsonPath.read(json, path); + if (o instanceof String) + return (String) o; + throw new IllegalStateException(String.format("could not extract '%s' from:\n%s", path, json)); + } + +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphTableSparkJob.java similarity index 98% 
rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJob.java rename to dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphTableSparkJob.java index 037683604..e53f4ca30 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphTableSparkJob.java @@ -33,9 +33,9 @@ import scala.Tuple2; * are picked preferring those from the BETA aggregator rather then from PROD. The identity of a relationship is defined * by eu.dnetlib.dhp.schema.common.ModelSupport#idFn() */ -public class MergeGraphSparkJob { +public class MergeGraphTableSparkJob { - private static final Logger log = LoggerFactory.getLogger(CleanGraphSparkJob.class); + private static final Logger log = LoggerFactory.getLogger(MergeGraphTableSparkJob.class); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java index 3568dc52a..99aa9c3f0 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java @@ -4,11 +4,9 @@ package eu.dnetlib.dhp.oa.graph.raw; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.io.IOException; -import java.util.Arrays; -import java.util.List; -import java.util.Objects; -import java.util.Optional; +import java.util.*; import java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; @@ -20,6 +18,7 @@ import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SparkSession; +import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -29,16 +28,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.Dataset; -import eu.dnetlib.dhp.schema.oaf.Datasource; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.OafEntity; -import eu.dnetlib.dhp.schema.oaf.Organization; -import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; -import eu.dnetlib.dhp.schema.oaf.Project; -import eu.dnetlib.dhp.schema.oaf.Publication; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.schema.oaf.Software; +import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import scala.Tuple2; @@ -113,7 +103,7 @@ public class GenerateEntitiesApplication { inputRdd .mapToPair(oaf -> new Tuple2<>(ModelSupport.idFn().apply(oaf), oaf)) - .reduceByKey((o1, o2) -> merge(o1, o2)) + .reduceByKey((o1, o2) -> OafMapperUtils.merge(o1, o2)) .map(Tuple2::_2) .map( oaf -> oaf.getClass().getSimpleName().toLowerCase() @@ -122,17 +112,6 @@ public class GenerateEntitiesApplication { .saveAsTextFile(targetPath, GzipCodec.class); } - private static Oaf merge(final Oaf o1, final Oaf 
o2) { - if (ModelSupport.isSubClass(o1, OafEntity.class)) { - ((OafEntity) o1).mergeFrom((OafEntity) o2); - } else if (ModelSupport.isSubClass(o1, Relation.class)) { - ((Relation) o1).mergeFrom((Relation) o2); - } else { - throw new RuntimeException("invalid Oaf type:" + o1.getClass().getCanonicalName()); - } - return o1; - } - private static List convertToListOaf( final String id, final String s, diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java deleted file mode 100644 index 84b29e3d4..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java +++ /dev/null @@ -1,273 +0,0 @@ - -package eu.dnetlib.dhp.oa.graph.raw.common; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.concurrent.ConcurrentHashMap; -import java.util.function.Function; -import java.util.function.Predicate; -import java.util.stream.Collectors; - -import org.apache.commons.lang3.StringUtils; - -import eu.dnetlib.dhp.schema.oaf.DataInfo; -import eu.dnetlib.dhp.schema.oaf.ExtraInfo; -import eu.dnetlib.dhp.schema.oaf.Field; -import eu.dnetlib.dhp.schema.oaf.Journal; -import eu.dnetlib.dhp.schema.oaf.KeyValue; -import eu.dnetlib.dhp.schema.oaf.OAIProvenance; -import eu.dnetlib.dhp.schema.oaf.OriginDescription; -import eu.dnetlib.dhp.schema.oaf.Qualifier; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -import eu.dnetlib.dhp.utils.DHPUtils; - -public class OafMapperUtils { - - public static KeyValue keyValue(final String k, final String v) { - final KeyValue kv = new KeyValue(); - kv.setKey(k); - kv.setValue(v); - return kv; - } - - public static List listKeyValues(final String... s) { - if (s.length % 2 > 0) { - throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)"); - } - - final List list = new ArrayList<>(); - for (int i = 0; i < s.length; i += 2) { - list.add(keyValue(s[i], s[i + 1])); - } - return list; - } - - public static Field field(final T value, final DataInfo info) { - if (value == null || StringUtils.isBlank(value.toString())) { - return null; - } - - final Field field = new Field<>(); - field.setValue(value); - field.setDataInfo(info); - return field; - } - - public static List> listFields(final DataInfo info, final String... 
values) { - return Arrays - .stream(values) - .map(v -> field(v, info)) - .filter(Objects::nonNull) - .filter(distinctByKey(f -> f.getValue())) - .collect(Collectors.toList()); - } - - public static List> listFields(final DataInfo info, final List values) { - return values - .stream() - .map(v -> field(v, info)) - .filter(Objects::nonNull) - .filter(distinctByKey(f -> f.getValue())) - .collect(Collectors.toList()); - } - - public static Qualifier unknown(final String schemeid, final String schemename) { - return qualifier("UNKNOWN", "Unknown", schemeid, schemename); - } - - public static Qualifier qualifier( - final String classid, - final String classname, - final String schemeid, - final String schemename) { - final Qualifier q = new Qualifier(); - q.setClassid(classid); - q.setClassname(classname); - q.setSchemeid(schemeid); - q.setSchemename(schemename); - return q; - } - - public static StructuredProperty structuredProperty( - final String value, - final String classid, - final String classname, - final String schemeid, - final String schemename, - final DataInfo dataInfo) { - - return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo); - } - - public static StructuredProperty structuredProperty( - final String value, - final Qualifier qualifier, - final DataInfo dataInfo) { - if (value == null) { - return null; - } - final StructuredProperty sp = new StructuredProperty(); - sp.setValue(value); - sp.setQualifier(qualifier); - sp.setDataInfo(dataInfo); - return sp; - } - - public static ExtraInfo extraInfo( - final String name, - final String value, - final String typology, - final String provenance, - final String trust) { - final ExtraInfo info = new ExtraInfo(); - info.setName(name); - info.setValue(value); - info.setTypology(typology); - info.setProvenance(provenance); - info.setTrust(trust); - return info; - } - - public static OAIProvenance oaiIProvenance( - final String identifier, - final String baseURL, - final String metadataNamespace, - final Boolean altered, - final String datestamp, - final String harvestDate) { - - final OriginDescription desc = new OriginDescription(); - desc.setIdentifier(identifier); - desc.setBaseURL(baseURL); - desc.setMetadataNamespace(metadataNamespace); - desc.setAltered(altered); - desc.setDatestamp(datestamp); - desc.setHarvestDate(harvestDate); - - final OAIProvenance p = new OAIProvenance(); - p.setOriginDescription(desc); - - return p; - } - - public static Journal journal( - final String name, - final String issnPrinted, - final String issnOnline, - final String issnLinking, - final DataInfo dataInfo) { - return journal( - name, - issnPrinted, - issnOnline, - issnLinking, - null, - null, - null, - null, - null, - null, - null, - dataInfo); - } - - public static Journal journal( - final String name, - final String issnPrinted, - final String issnOnline, - final String issnLinking, - final String ep, - final String iss, - final String sp, - final String vol, - final String edition, - final String conferenceplace, - final String conferencedate, - final DataInfo dataInfo) { - - if (StringUtils.isNotBlank(name) - || StringUtils.isNotBlank(issnPrinted) - || StringUtils.isNotBlank(issnOnline) - || StringUtils.isNotBlank(issnLinking)) { - final Journal j = new Journal(); - j.setName(name); - j.setIssnPrinted(issnPrinted); - j.setIssnOnline(issnOnline); - j.setIssnLinking(issnLinking); - j.setEp(ep); - j.setIss(iss); - j.setSp(sp); - j.setVol(vol); - j.setEdition(edition); - j.setConferenceplace(conferenceplace); 
- j.setConferencedate(conferencedate); - j.setDataInfo(dataInfo); - return j; - } else { - return null; - } - } - - public static DataInfo dataInfo( - final Boolean deletedbyinference, - final String inferenceprovenance, - final Boolean inferred, - final Boolean invisible, - final Qualifier provenanceaction, - final String trust) { - final DataInfo d = new DataInfo(); - d.setDeletedbyinference(deletedbyinference); - d.setInferenceprovenance(inferenceprovenance); - d.setInferred(inferred); - d.setInvisible(invisible); - d.setProvenanceaction(provenanceaction); - d.setTrust(trust); - return d; - } - - public static String createOpenaireId( - final int prefix, - final String originalId, - final boolean to_md5) { - if (StringUtils.isBlank(originalId)) { - return null; - } else if (to_md5) { - final String nsPrefix = StringUtils.substringBefore(originalId, "::"); - final String rest = StringUtils.substringAfter(originalId, "::"); - return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest)); - } else { - return String.format("%s|%s", prefix, originalId); - } - } - - public static String createOpenaireId( - final String type, - final String originalId, - final boolean to_md5) { - switch (type) { - case "datasource": - return createOpenaireId(10, originalId, to_md5); - case "organization": - return createOpenaireId(20, originalId, to_md5); - case "person": - return createOpenaireId(30, originalId, to_md5); - case "project": - return createOpenaireId(40, originalId, to_md5); - default: - return createOpenaireId(50, originalId, to_md5); - } - } - - public static String asString(final Object o) { - return o == null ? "" : o.toString(); - } - - public static Predicate distinctByKey( - final Function keyExtractor) { - final Map seen = new ConcurrentHashMap<>(); - return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null; - } -} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dispatch_entities_bytype_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dispatch_entities_bytype_parameters.json new file mode 100644 index 000000000..7d995f39a --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dispatch_entities_bytype_parameters.json @@ -0,0 +1,20 @@ +[ + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "s", + "paramLongName": "sourcePath", + "paramDescription": "the source path", + "paramRequired": true + }, + { + "paramName": "g", + "paramLongName": "graphRawPath", + "paramDescription": "the path of the graph Raw in hdfs", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/group_graph_entities_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/group_graph_entities_parameters.json new file mode 100644 index 000000000..e65acb3c4 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/group_graph_entities_parameters.json @@ -0,0 +1,20 @@ +[ + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "gin", + "paramLongName": "graphInputPath", + "paramDescription": "the graph root path", + "paramRequired": true 
+ }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the output merged graph root path", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/groupbyid/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/groupbyid/oozie_app/config-default.xml new file mode 100644 index 000000000..2e0ed9aee --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/groupbyid/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/groupbyid/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/groupbyid/oozie_app/workflow.xml new file mode 100644 index 000000000..3715d097d --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/groupbyid/oozie_app/workflow.xml @@ -0,0 +1,271 @@ + + + + + graphInputPath + the graph root input path + + + outputPath + the graph root output path + + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + oozieActionShareLibForSpark2 + oozie action sharelib for spark 2.* + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + spark 2.* sql query execution listeners classname + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + yarn + cluster + Fuse graph entities by ID + eu.dnetlib.dhp.oa.graph.fuse.FuseGraphResultsSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --graphInputPath${graphInputPath} + --outputPath${workingDir}/entities + + + + + + + + yarn + cluster + Merge datasets + eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --betaInputPath${betaInputGgraphPath}/dataset + --prodInputPath${prodInputGgraphPath}/dataset + --outputPath${graphOutputPath}/dataset + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Dataset + --priority${priority} + + + + + + + + yarn + cluster + Merge 
otherresearchproducts + eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --betaInputPath${betaInputGgraphPath}/otherresearchproduct + --prodInputPath${prodInputGgraphPath}/otherresearchproduct + --outputPath${graphOutputPath}/otherresearchproduct + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --priority${priority} + + + + + + + + yarn + cluster + Merge softwares + eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --betaInputPath${betaInputGgraphPath}/software + --prodInputPath${prodInputGgraphPath}/software + --outputPath${graphOutputPath}/software + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Software + --priority${priority} + + + + + + + + yarn + cluster + Merge datasources + eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --betaInputPath${betaInputGgraphPath}/datasource + --prodInputPath${prodInputGgraphPath}/datasource + --outputPath${graphOutputPath}/datasource + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Datasource + --priority${priority} + + + + + + + + yarn + cluster + Merge organizations + eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --betaInputPath${betaInputGgraphPath}/organization + --prodInputPath${prodInputGgraphPath}/organization + --outputPath${graphOutputPath}/organization + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Organization + --priority${priority} + + + + + + + + yarn + cluster + Merge projects + eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + 
--conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --betaInputPath${betaInputGgraphPath}/project + --prodInputPath${prodInputGgraphPath}/project + --outputPath${graphOutputPath}/project + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Project + --priority${priority} + + + + + + + + yarn + cluster + Merge relations + eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --betaInputPath${betaInputGgraphPath}/relation + --prodInputPath${prodInputGgraphPath}/relation + --outputPath${graphOutputPath}/relation + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Relation + --priority${priority} + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml index 07a125fb6..604f515a5 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml @@ -76,7 +76,7 @@ yarn cluster Merge publications - eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob + eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob dhp-graph-mapper-${projectVersion}.jar --executor-cores=${sparkExecutorCores} @@ -103,7 +103,7 @@ yarn cluster Merge datasets - eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob + eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob dhp-graph-mapper-${projectVersion}.jar --executor-cores=${sparkExecutorCores} @@ -130,7 +130,7 @@ yarn cluster Merge otherresearchproducts - eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob + eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob dhp-graph-mapper-${projectVersion}.jar --executor-cores=${sparkExecutorCores} @@ -157,7 +157,7 @@ yarn cluster Merge softwares - eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob + eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob dhp-graph-mapper-${projectVersion}.jar --executor-cores=${sparkExecutorCores} @@ -184,7 +184,7 @@ yarn cluster Merge datasources - eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob + eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob dhp-graph-mapper-${projectVersion}.jar --executor-cores=${sparkExecutorCores} @@ -211,7 +211,7 @@ yarn cluster Merge organizations - eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob + eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob dhp-graph-mapper-${projectVersion}.jar --executor-cores=${sparkExecutorCores} @@ -238,7 +238,7 @@ yarn cluster Merge projects - eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob + eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob dhp-graph-mapper-${projectVersion}.jar --executor-cores=${sparkExecutorCores} @@ -265,7 +265,7 @@ yarn cluster Merge 
relations - eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob + eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob dhp-graph-mapper-${projectVersion}.jar --executor-cores=${sparkExecutorCores} diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJobTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphTableSparkJobTest.java similarity index 90% rename from dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJobTest.java rename to dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphTableSparkJobTest.java index 28e8e5abc..0089811cf 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJobTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphTableSparkJobTest.java @@ -15,7 +15,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.schema.oaf.Datasource; -public class MergeGraphSparkJobTest { +public class MergeGraphTableSparkJobTest { private ObjectMapper mapper; @@ -28,7 +28,7 @@ public class MergeGraphSparkJobTest { public void testMergeDatasources() throws IOException { assertEquals( "openaire-cris_1.1", - MergeGraphSparkJob + MergeGraphTableSparkJob .mergeDatasource( d("datasource_cris.json"), d("datasource_UNKNOWN.json")) @@ -36,7 +36,7 @@ public class MergeGraphSparkJobTest { .getClassid()); assertEquals( "openaire-cris_1.1", - MergeGraphSparkJob + MergeGraphTableSparkJob .mergeDatasource( d("datasource_UNKNOWN.json"), d("datasource_cris.json")) @@ -44,7 +44,7 @@ public class MergeGraphSparkJobTest { .getClassid()); assertEquals( "driver-openaire2.0", - MergeGraphSparkJob + MergeGraphTableSparkJob .mergeDatasource( d("datasource_native.json"), d("datasource_driver-openaire2.0.json")) @@ -52,7 +52,7 @@ public class MergeGraphSparkJobTest { .getClassid()); assertEquals( "driver-openaire2.0", - MergeGraphSparkJob + MergeGraphTableSparkJob .mergeDatasource( d("datasource_driver-openaire2.0.json"), d("datasource_native.json")) @@ -60,7 +60,7 @@ public class MergeGraphSparkJobTest { .getClassid()); assertEquals( "openaire4.0", - MergeGraphSparkJob + MergeGraphTableSparkJob .mergeDatasource( d("datasource_notCompatible.json"), d("datasource_openaire4.0.json")) @@ -68,7 +68,7 @@ public class MergeGraphSparkJobTest { .getClassid()); assertEquals( "notCompatible", - MergeGraphSparkJob + MergeGraphTableSparkJob .mergeDatasource( d("datasource_notCompatible.json"), d("datasource_UNKNOWN.json")) diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplicationTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplicationTest.java new file mode 100644 index 000000000..705f1dddb --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplicationTest.java @@ -0,0 +1,99 @@ + +package eu.dnetlib.dhp.oa.graph.raw; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.Mockito.lenient; + +import java.io.IOException; +import java.util.List; + +import org.apache.commons.io.IOUtils; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; 
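The GenerateEntitiesApplicationTest added below exercises OafMapperUtils.mergeResults, which keeps whichever of two results has the stronger type (publication over dataset over software over other research product) and lets it absorb the other. A small self-contained sketch of the behaviour the test asserts; the records and the result-typology scheme literal are hypothetical:

    import eu.dnetlib.dhp.schema.common.ModelConstants;
    import eu.dnetlib.dhp.schema.oaf.Dataset;
    import eu.dnetlib.dhp.schema.oaf.OafMapperUtils;
    import eu.dnetlib.dhp.schema.oaf.Publication;
    import eu.dnetlib.dhp.schema.oaf.Qualifier;
    import eu.dnetlib.dhp.schema.oaf.Result;

    public class MergeResultsSketch {

        public static void main(String[] args) {
            // hypothetical publication and dataset mapped from the same record id
            Publication pub = new Publication();
            pub.setResulttype(resultType(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID));

            Dataset ds = new Dataset();
            ds.setResulttype(resultType(ModelConstants.DATASET_RESULTTYPE_CLASSID));

            // the publication ranks higher, so it absorbs the dataset's fields
            Result merged = OafMapperUtils.mergeResults(pub, ds);

            System.out.println(merged.getClass().getSimpleName());    // Publication
            System.out.println(merged.getResulttype().getClassid());  // publication
        }

        // only the classid matters for the ranking; the scheme values are illustrative
        private static Qualifier resultType(String classid) {
            return OafMapperUtils.qualifier(
                classid, classid, "dnet:result_typologies", "dnet:result_typologies");
        }
    }

The test repeats the same check for every ordered pair of result types produced from the sample XML records.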
+ +import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctionTest; +import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; + +@ExtendWith(MockitoExtension.class) +public class GenerateEntitiesApplicationTest { + + @Mock + private ISLookUpService isLookUpService; + + @Mock + private VocabularyGroup vocs; + + @BeforeEach + public void setUp() throws IOException, ISLookUpException { + + lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs()); + lenient() + .when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY)) + .thenReturn(synonyms()); + + vocs = VocabularyGroup.loadVocsFromIS(isLookUpService); + } + + @Test + public void testMergeResult() throws IOException { + Result publication = getResult("oaf_record.xml", Publication.class); + Result dataset = getResult("odf_dataset.xml", Dataset.class); + Result software = getResult("odf_software.xml", Software.class); + Result orp = getResult("oaf_orp.xml", OtherResearchProduct.class); + + verifyMerge(publication, dataset, Publication.class, ModelConstants.PUBLICATION_RESULTTYPE_CLASSID); + verifyMerge(dataset, publication, Publication.class, ModelConstants.PUBLICATION_RESULTTYPE_CLASSID); + + verifyMerge(publication, software, Publication.class, ModelConstants.PUBLICATION_RESULTTYPE_CLASSID); + verifyMerge(software, publication, Publication.class, ModelConstants.PUBLICATION_RESULTTYPE_CLASSID); + + verifyMerge(publication, orp, Publication.class, ModelConstants.PUBLICATION_RESULTTYPE_CLASSID); + verifyMerge(orp, publication, Publication.class, ModelConstants.PUBLICATION_RESULTTYPE_CLASSID); + + verifyMerge(dataset, software, Dataset.class, ModelConstants.DATASET_RESULTTYPE_CLASSID); + verifyMerge(software, dataset, Dataset.class, ModelConstants.DATASET_RESULTTYPE_CLASSID); + + verifyMerge(dataset, orp, Dataset.class, ModelConstants.DATASET_RESULTTYPE_CLASSID); + verifyMerge(orp, dataset, Dataset.class, ModelConstants.DATASET_RESULTTYPE_CLASSID); + + verifyMerge(software, orp, Software.class, ModelConstants.SOFTWARE_RESULTTYPE_CLASSID); + verifyMerge(orp, software, Software.class, ModelConstants.SOFTWARE_RESULTTYPE_CLASSID); + } + + protected void verifyMerge(Result publication, Result dataset, Class clazz, + String resultType) { + final Result merge = OafMapperUtils.mergeResults(publication, dataset); + assertTrue(clazz.isAssignableFrom(merge.getClass())); + assertEquals(resultType, merge.getResulttype().getClassid()); + } + + protected Result getResult(String xmlFileName, Class clazz) throws IOException { + final String xml = IOUtils.toString(getClass().getResourceAsStream(xmlFileName)); + return new OdfToOafMapper(vocs, false) + .processMdRecord(xml) + .stream() + .filter(s -> clazz.isAssignableFrom(s.getClass())) + .map(s -> (Result) s) + .findFirst() + .get(); + } + + private List vocs() throws IOException { + return IOUtils + .readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt")); + } + + private List synonyms() throws IOException { + return IOUtils + .readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt")); + } + +} From 2bed29eb09616e01ee1d26c082847d7298db9e98 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 13 Nov 2020 10:05:12 +0100 Subject: [PATCH 
11/17] WIP: added oozie workflow for grouping graph entities by id --- .../dhp/schema/oaf/OafMapperUtils.java | 297 +++++++++++++++++- .../dhp/schema/oaf/ResultTypeComparator.java | 49 ++- .../java/eu/dnetlib/dhp/utils/DHPUtils.java | 7 + .../oa/graph/clean/CleanGraphSparkJob.java | 10 +- .../dhp/oa/graph/clean/CleaningFunctions.java | 1 - .../groupbyid/DispatchEntitiesSparkJob.java | 64 ++-- .../GroupEntitiesAndRelationsSparkJob.java | 80 +++-- .../raw/AbstractMdRecordToOafMapper.java | 2 +- .../raw/GenerateEntitiesApplication.java | 24 +- .../raw/MigrateDbEntitiesApplication.java | 10 +- .../dhp/oa/graph/raw/OafToOafMapper.java | 6 +- .../dhp/oa/graph/raw/OdfToOafMapper.java | 6 +- .../dhp/oa/graph/raw/common/Vocabulary.java | 1 + .../oa/graph/raw/common/VocabularyGroup.java | 1 + .../dispatch_entities_bytype_parameters.json | 16 +- .../oa/graph/groupbyid/oozie_app/workflow.xml | 188 ++++++----- .../dhp/oa/graph/merge/oozie_app/workflow.xml | 4 +- .../raw/MigrateDbEntitiesApplicationTest.java | 8 +- .../dhp/oa/provision/XmlConverterJob.java | 15 +- 19 files changed, 578 insertions(+), 211 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/OafMapperUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/OafMapperUtils.java index 6ba918069..4a66f91dc 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/OafMapperUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/OafMapperUtils.java @@ -1,2 +1,297 @@ -package eu.dnetlib.dhp.schema.oaf;public class OafMapperUtils { + +package eu.dnetlib.dhp.schema.oaf; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.concurrent.ConcurrentHashMap; +import java.util.function.Function; +import java.util.function.Predicate; +import java.util.stream.Collectors; + +import org.apache.commons.lang3.StringUtils; + +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.utils.DHPUtils; + +public class OafMapperUtils { + + public static Oaf merge(final Oaf o1, final Oaf o2) { + if (ModelSupport.isSubClass(o1, OafEntity.class)) { + if (ModelSupport.isSubClass(o1, Result.class)) { + + return mergeResults((Result) o1, (Result) o2); + } else if (ModelSupport.isSubClass(o1, Datasource.class)) { + ((Datasource) o1).mergeFrom((Datasource) o2); + } else if (ModelSupport.isSubClass(o1, Organization.class)) { + ((Organization) o1).mergeFrom((Organization) o2); + } else if (ModelSupport.isSubClass(o1, Project.class)) { + ((Project) o1).mergeFrom((Project) o2); + } else { + throw new RuntimeException("invalid OafEntity subtype:" + o1.getClass().getCanonicalName()); + } + } else if (ModelSupport.isSubClass(o1, Relation.class)) { + ((Relation) o1).mergeFrom((Relation) o2); + } else { + throw new RuntimeException("invalid Oaf type:" + o1.getClass().getCanonicalName()); + } + return o1; + } + + public static Result mergeResults(Result r1, Result r2) { + if (new ResultTypeComparator().compare(r1, r2) < 0) { + r1.mergeFrom(r2); + return r1; + } else { + r2.mergeFrom(r1); + return r2; + } + } + + public static KeyValue keyValue(final String k, final String v) { + final KeyValue kv = new KeyValue(); + kv.setKey(k); + kv.setValue(v); + return kv; + } + + public static List listKeyValues(final String... 
s) { + if (s.length % 2 > 0) { + throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)"); + } + + final List list = new ArrayList<>(); + for (int i = 0; i < s.length; i += 2) { + list.add(keyValue(s[i], s[i + 1])); + } + return list; + } + + public static Field field(final T value, final DataInfo info) { + if (value == null || StringUtils.isBlank(value.toString())) { + return null; + } + + final Field field = new Field<>(); + field.setValue(value); + field.setDataInfo(info); + return field; + } + + public static List> listFields(final DataInfo info, final String... values) { + return Arrays + .stream(values) + .map(v -> field(v, info)) + .filter(Objects::nonNull) + .filter(distinctByKey(f -> f.getValue())) + .collect(Collectors.toList()); + } + + public static List> listFields(final DataInfo info, final List values) { + return values + .stream() + .map(v -> field(v, info)) + .filter(Objects::nonNull) + .filter(distinctByKey(f -> f.getValue())) + .collect(Collectors.toList()); + } + + public static Qualifier unknown(final String schemeid, final String schemename) { + return qualifier("UNKNOWN", "Unknown", schemeid, schemename); + } + + public static Qualifier qualifier( + final String classid, + final String classname, + final String schemeid, + final String schemename) { + final Qualifier q = new Qualifier(); + q.setClassid(classid); + q.setClassname(classname); + q.setSchemeid(schemeid); + q.setSchemename(schemename); + return q; + } + + public static StructuredProperty structuredProperty( + final String value, + final String classid, + final String classname, + final String schemeid, + final String schemename, + final DataInfo dataInfo) { + + return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo); + } + + public static StructuredProperty structuredProperty( + final String value, + final Qualifier qualifier, + final DataInfo dataInfo) { + if (value == null) { + return null; + } + final StructuredProperty sp = new StructuredProperty(); + sp.setValue(value); + sp.setQualifier(qualifier); + sp.setDataInfo(dataInfo); + return sp; + } + + public static ExtraInfo extraInfo( + final String name, + final String value, + final String typology, + final String provenance, + final String trust) { + final ExtraInfo info = new ExtraInfo(); + info.setName(name); + info.setValue(value); + info.setTypology(typology); + info.setProvenance(provenance); + info.setTrust(trust); + return info; + } + + public static OAIProvenance oaiIProvenance( + final String identifier, + final String baseURL, + final String metadataNamespace, + final Boolean altered, + final String datestamp, + final String harvestDate) { + + final OriginDescription desc = new OriginDescription(); + desc.setIdentifier(identifier); + desc.setBaseURL(baseURL); + desc.setMetadataNamespace(metadataNamespace); + desc.setAltered(altered); + desc.setDatestamp(datestamp); + desc.setHarvestDate(harvestDate); + + final OAIProvenance p = new OAIProvenance(); + p.setOriginDescription(desc); + + return p; + } + + public static Journal journal( + final String name, + final String issnPrinted, + final String issnOnline, + final String issnLinking, + final DataInfo dataInfo) { + return journal( + name, + issnPrinted, + issnOnline, + issnLinking, + null, + null, + null, + null, + null, + null, + null, + dataInfo); + } + + public static Journal journal( + final String name, + final String issnPrinted, + final String issnOnline, + final String issnLinking, + final String ep, + final String iss, + final 
String sp, + final String vol, + final String edition, + final String conferenceplace, + final String conferencedate, + final DataInfo dataInfo) { + + if (StringUtils.isNotBlank(name) + || StringUtils.isNotBlank(issnPrinted) + || StringUtils.isNotBlank(issnOnline) + || StringUtils.isNotBlank(issnLinking)) { + final Journal j = new Journal(); + j.setName(name); + j.setIssnPrinted(issnPrinted); + j.setIssnOnline(issnOnline); + j.setIssnLinking(issnLinking); + j.setEp(ep); + j.setIss(iss); + j.setSp(sp); + j.setVol(vol); + j.setEdition(edition); + j.setConferenceplace(conferenceplace); + j.setConferencedate(conferencedate); + j.setDataInfo(dataInfo); + return j; + } else { + return null; + } + } + + public static DataInfo dataInfo( + final Boolean deletedbyinference, + final String inferenceprovenance, + final Boolean inferred, + final Boolean invisible, + final Qualifier provenanceaction, + final String trust) { + final DataInfo d = new DataInfo(); + d.setDeletedbyinference(deletedbyinference); + d.setInferenceprovenance(inferenceprovenance); + d.setInferred(inferred); + d.setInvisible(invisible); + d.setProvenanceaction(provenanceaction); + d.setTrust(trust); + return d; + } + + public static String createOpenaireId( + final int prefix, + final String originalId, + final boolean to_md5) { + if (StringUtils.isBlank(originalId)) { + return null; + } else if (to_md5) { + final String nsPrefix = StringUtils.substringBefore(originalId, "::"); + final String rest = StringUtils.substringAfter(originalId, "::"); + return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest)); + } else { + return String.format("%s|%s", prefix, originalId); + } + } + + public static String createOpenaireId( + final String type, + final String originalId, + final boolean to_md5) { + switch (type) { + case "datasource": + return createOpenaireId(10, originalId, to_md5); + case "organization": + return createOpenaireId(20, originalId, to_md5); + case "person": + return createOpenaireId(30, originalId, to_md5); + case "project": + return createOpenaireId(40, originalId, to_md5); + default: + return createOpenaireId(50, originalId, to_md5); + } + } + + public static String asString(final Object o) { + return o == null ? 
"" : o.toString(); + } + + public static Predicate distinctByKey( + final Function keyExtractor) { + final Map seen = new ConcurrentHashMap<>(); + return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/ResultTypeComparator.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/ResultTypeComparator.java index a2e64b83b..6c11d1a85 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/ResultTypeComparator.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/ResultTypeComparator.java @@ -1,2 +1,49 @@ -package eu.dnetlib.dhp.schema.oaf;public class ResultTypeComparator { + +package eu.dnetlib.dhp.schema.oaf; + +import java.util.Comparator; + +import eu.dnetlib.dhp.schema.common.ModelConstants; + +public class ResultTypeComparator implements Comparator { + + @Override + public int compare(Result left, Result right) { + + if (left == null && right == null) + return 0; + if (left == null) + return 1; + if (right == null) + return -1; + + String lClass = left.getResulttype().getClassid(); + String rClass = right.getResulttype().getClassid(); + + if (lClass.equals(rClass)) + return 0; + + if (lClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID)) + return -1; + if (rClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID)) + return 1; + + if (lClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID)) + return -1; + if (rClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID)) + return 1; + + if (lClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID)) + return -1; + if (rClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID)) + return 1; + + if (lClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID)) + return -1; + if (rClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID)) + return 1; + + // Else (but unlikely), lexicographical ordering will do. 
+ return lClass.compareTo(rClass); + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java index dfbaf3a6c..8872174a5 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java @@ -5,6 +5,7 @@ import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.nio.charset.StandardCharsets; import java.security.MessageDigest; +import java.util.List; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; @@ -15,9 +16,15 @@ import org.apache.commons.codec.binary.Hex; import com.jayway.jsonpath.JsonPath; import net.minidev.json.JSONArray; +import scala.collection.JavaConverters; +import scala.collection.Seq; public class DHPUtils { + public static Seq toSeq(List list) { + return JavaConverters.asScalaIteratorConverter(list.iterator()).asScala().toSeq(); + } + public static String md5(final String s) { try { final MessageDigest md = MessageDigest.getInstance("MD5"); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java index e295b9503..714b35dac 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java @@ -3,13 +3,9 @@ package eu.dnetlib.dhp.oa.graph.clean; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import java.io.BufferedInputStream; -import java.util.Objects; import java.util.Optional; -import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; @@ -23,11 +19,9 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper; -import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; -import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.OafEntity; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java index 56a4aaf5a..9f06ea056 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java @@ -11,7 +11,6 @@ import org.apache.commons.lang3.StringUtils; import com.clearspring.analytics.util.Lists; import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper; -import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/groupbyid/DispatchEntitiesSparkJob.java 
b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/groupbyid/DispatchEntitiesSparkJob.java index 6b90e9971..1b4226411 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/groupbyid/DispatchEntitiesSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/groupbyid/DispatchEntitiesSparkJob.java @@ -1,11 +1,10 @@ -package eu.dnetlib.dhp.oa.graph.fuse; +package eu.dnetlib.dhp.oa.graph.groupbyid; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication; -import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.Oaf; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; @@ -17,14 +16,22 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.Optional; +import com.fasterxml.jackson.databind.ObjectMapper; -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.OafEntity; +import scala.Tuple2; public class DispatchEntitiesSparkJob { private static final Logger log = LoggerFactory.getLogger(DispatchEntitiesSparkJob.class); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils @@ -40,48 +47,51 @@ public class DispatchEntitiesSparkJob { .orElse(Boolean.TRUE); log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String sourcePath = parser.get("sourcePath"); - final String targetPath = parser.get("graphRawPath"); + final String entitiesPath = parser.get("entitiesPath"); + log.info("entitiesPath: {}", entitiesPath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + String graphTableClassName = parser.get("graphTableClassName"); + log.info("graphTableClassName: {}", graphTableClassName); + + Class entityClazz = (Class) Class.forName(graphTableClassName); SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ModelSupport.getOafModelClasses()); + runWithSparkSession( conf, isSparkSessionManaged, spark -> { - removeOutputDir(spark, targetPath); - ModelSupport.oafTypes - .values() - .forEach(clazz -> processEntity(spark, clazz, sourcePath, targetPath)); + HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration()); + dispatchOaf(spark, entityClazz, entitiesPath, outputPath); }); } - private static void processEntity( + private static void dispatchOaf( final SparkSession spark, final Class clazz, final String sourcePath, final String targetPath) { - final String type = clazz.getSimpleName().toLowerCase(); - log.info("Processing entities ({}) in file: {}", type, sourcePath); + log.info("Processing entities ({}) in file: {}", clazz.getName(), sourcePath); spark .read() .textFile(sourcePath) - .filter((FilterFunction) 
value -> isEntityType(value, type)) - .map( - (MapFunction) l -> StringUtils.substringAfter(l, "|"), - Encoders.STRING()) + .filter((FilterFunction) s -> isEntityType(s, clazz)) + .map((MapFunction) s -> StringUtils.substringAfter(s, "|"), Encoders.STRING()) .write() .option("compression", "gzip") .mode(SaveMode.Overwrite) - .text(targetPath + "/" + type); + .text(targetPath); } - private static boolean isEntityType(final String line, final String type) { - return StringUtils.substringBefore(line, "|").equalsIgnoreCase(type); + private static boolean isEntityType(final String s, final Class clazz) { + return StringUtils.substringBefore(s, "|").equals(clazz.getName()); } - private static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/groupbyid/GroupEntitiesAndRelationsSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/groupbyid/GroupEntitiesAndRelationsSparkJob.java index 7ab8646ec..1d887cecb 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/groupbyid/GroupEntitiesAndRelationsSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/groupbyid/GroupEntitiesAndRelationsSparkJob.java @@ -12,9 +12,9 @@ import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.*; import org.apache.spark.sql.expressions.Aggregator; @@ -22,7 +22,10 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; +import com.jayway.jsonpath.Configuration; +import com.jayway.jsonpath.DocumentContext; import com.jayway.jsonpath.JsonPath; +import com.jayway.jsonpath.Option; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; @@ -33,19 +36,21 @@ import scala.Tuple2; /** * Groups the graph content by entity identifier to ensure ID uniqueness */ -public class GroupEntitiesSparkJob { +public class GroupEntitiesAndRelationsSparkJob { - private static final Logger log = LoggerFactory.getLogger(GroupEntitiesSparkJob.class); + private static final Logger log = LoggerFactory.getLogger(GroupEntitiesAndRelationsSparkJob.class); private final static String ID_JPATH = "$.id"; + private final static String SOURCE_JPATH = "$.source"; + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); public static void main(String[] args) throws Exception { String jsonConfiguration = IOUtils .toString( - GroupEntitiesSparkJob.class + GroupEntitiesAndRelationsSparkJob.class .getResourceAsStream( "/eu/dnetlib/dhp/oa/graph/group_graph_entities_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); @@ -72,11 +77,11 @@ public class GroupEntitiesSparkJob { isSparkSessionManaged, spark -> { HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration()); - groupEntities(spark, graphInputPath, outputPath); + groupEntitiesAndRelations(spark, graphInputPath, outputPath); }); } - private static void groupEntities( + private static void groupEntitiesAndRelations( SparkSession spark, 
String inputPath, String outputPath) { @@ -85,14 +90,19 @@ public class GroupEntitiesSparkJob { final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); spark .read() - .textFile(toSeq(listEntityPaths(inputPath, sc))) - .map((MapFunction) s -> parseEntity(s), Encoders.kryo(Oaf.class)) + .textFile(toSeq(listPaths(inputPath, sc))) + .map((MapFunction) s -> parseOaf(s), Encoders.kryo(Oaf.class)) + .filter((FilterFunction) oaf -> StringUtils.isNotBlank(ModelSupport.idFn().apply(oaf))) .groupByKey((MapFunction) oaf -> ModelSupport.idFn().apply(oaf), Encoders.STRING()) .agg(aggregator) - .map((MapFunction, Oaf>) Tuple2::_2, Encoders.kryo(Oaf.class)) + .map( + (MapFunction, String>) t -> t._2().getClass().getName() + + "|" + OBJECT_MAPPER.writeValueAsString(t._2()), + Encoders.STRING()) .write() + .option("compression", "gzip") .mode(SaveMode.Overwrite) - .save(outputPath); + .text(outputPath); } public static class GroupingAggregator extends Aggregator { @@ -136,51 +146,61 @@ public class GroupEntitiesSparkJob { } - private static Oaf parseEntity(String s) { - String prefix = StringUtils.substringBefore(jPath(ID_JPATH, s), "|"); - try { + private static Oaf parseOaf(String s) { + + DocumentContext dc = JsonPath + .parse(s, Configuration.defaultConfiguration().addOptions(Option.SUPPRESS_EXCEPTIONS)); + final String id = dc.read(ID_JPATH); + if (StringUtils.isNotBlank(id)) { + + String prefix = StringUtils.substringBefore(id, "|"); switch (prefix) { case "10": - return OBJECT_MAPPER.readValue(s, Datasource.class); + return parse(s, Datasource.class); case "20": - return OBJECT_MAPPER.readValue(s, Organization.class); + return parse(s, Organization.class); case "40": - return OBJECT_MAPPER.readValue(s, Project.class); + return parse(s, Project.class); case "50": - String resultType = jPath("$.resulttype.classid", s); + String resultType = dc.read("$.resulttype.classid"); switch (resultType) { case "publication": - return OBJECT_MAPPER.readValue(s, Publication.class); + return parse(s, Publication.class); case "dataset": - return OBJECT_MAPPER.readValue(s, eu.dnetlib.dhp.schema.oaf.Dataset.class); + return parse(s, eu.dnetlib.dhp.schema.oaf.Dataset.class); case "software": - return OBJECT_MAPPER.readValue(s, Software.class); + return parse(s, Software.class); case "other": - return OBJECT_MAPPER.readValue(s, OtherResearchProduct.class); + return parse(s, OtherResearchProduct.class); default: throw new IllegalArgumentException(String.format("invalid resultType: '%s'", resultType)); } default: throw new IllegalArgumentException(String.format("invalid id prefix: '%s'", prefix)); } + } else { + String source = dc.read(SOURCE_JPATH); + if (StringUtils.isNotBlank(source)) { + return parse(s, Relation.class); + } else { + throw new IllegalArgumentException(String.format("invalid oaf: '%s'", s)); + } + } + } + + private static Oaf parse(String s, Class clazz) { + try { + return OBJECT_MAPPER.readValue(s, clazz); } catch (IOException e) { throw new RuntimeException(e); } } - private static List listEntityPaths(String inputPath, JavaSparkContext sc) { + private static List listPaths(String inputPath, JavaSparkContext sc) { return HdfsSupport .listFiles(inputPath, sc.hadoopConfiguration()) .stream() - .filter(p -> !p.contains("relation")) .collect(Collectors.toList()); } - private static String jPath(final String path, final String json) { - Object o = JsonPath.read(json, path); - if (o instanceof String) - return (String) o; - throw new IllegalStateException(String.format("could 
not extract '%s' from:\n%s", path, json)); - } - } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 5b6ae72f1..da4b5e324 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -1,8 +1,8 @@ package eu.dnetlib.dhp.oa.graph.raw; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.*; import static eu.dnetlib.dhp.schema.common.ModelConstants.*; +import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.*; import java.util.*; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java index 99aa9c3f0..cfd190670 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java @@ -4,9 +4,11 @@ package eu.dnetlib.dhp.oa.graph.raw; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.io.IOException; -import java.util.*; +import java.util.Arrays; +import java.util.List; +import java.util.Objects; +import java.util.Optional; import java.util.stream.Collectors; -import java.util.stream.Stream; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; @@ -18,7 +20,6 @@ import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SparkSession; -import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -68,7 +69,7 @@ public class GenerateEntitiesApplication { final SparkConf conf = new SparkConf(); runWithSparkSession(conf, isSparkSessionManaged, spark -> { - removeOutputDir(spark, targetPath); + HdfsSupport.remove(targetPath, spark.sparkContext().hadoopConfiguration()); generateEntities(spark, vocs, sourcePaths, targetPath); }); } @@ -82,7 +83,7 @@ public class GenerateEntitiesApplication { final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); final List existingSourcePaths = Arrays .stream(sourcePaths.split(",")) - .filter(p -> exists(sc, p)) + .filter(p -> HdfsSupport.exists(p, sc.hadoopConfiguration())) .collect(Collectors.toList()); log.info("Generate entities from files:"); @@ -160,17 +161,4 @@ public class GenerateEntitiesApplication { } } - private static boolean exists(final JavaSparkContext context, final String pathToFile) { - try { - final FileSystem hdfs = FileSystem.get(context.hadoopConfiguration()); - final Path path = new Path(pathToFile); - return hdfs.exists(path); - } catch (final IOException e) { - throw new RuntimeException(e); - } - } - - private static void removeOutputDir(final SparkSession spark, final String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index 6365a1db9..f6f0e0a36 100644 --- 
a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -1,15 +1,6 @@ package eu.dnetlib.dhp.oa.graph.raw; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.asString; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.dataInfo; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.journal; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.listFields; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.listKeyValues; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty; import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASET_DEFAULT_RESULTTYPE; import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASOURCE_ORGANIZATION; import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PROVENANCE_ACTIONS; @@ -32,6 +23,7 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT; import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT; import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE; import static eu.dnetlib.dhp.schema.common.ModelConstants.USER_CLAIM; +import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.*; import java.io.Closeable; import java.io.IOException; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index dea80fabd..e62bc0790 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -1,10 +1,10 @@ package eu.dnetlib.dhp.oa.graph.raw; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty; import static eu.dnetlib.dhp.schema.common.ModelConstants.*; +import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.createOpenaireId; +import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.field; +import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.structuredProperty; import java.util.ArrayList; import java.util.List; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 6fe7bb971..7124684d5 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -1,10 +1,10 @@ package eu.dnetlib.dhp.oa.graph.raw; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty; import static eu.dnetlib.dhp.schema.common.ModelConstants.*; +import static 
eu.dnetlib.dhp.schema.oaf.OafMapperUtils.createOpenaireId; +import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.field; +import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.structuredProperty; import java.util.ArrayList; import java.util.Arrays; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/Vocabulary.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/Vocabulary.java index 9bf198c8b..bfc4fd6f1 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/Vocabulary.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/Vocabulary.java @@ -10,6 +10,7 @@ import org.apache.commons.lang3.StringUtils; import com.google.common.collect.Maps; +import eu.dnetlib.dhp.schema.oaf.OafMapperUtils; import eu.dnetlib.dhp.schema.oaf.Qualifier; public class Vocabulary implements Serializable { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java index 334339d3b..32452bdc5 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java @@ -7,6 +7,7 @@ import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; +import eu.dnetlib.dhp.schema.oaf.OafMapperUtils; import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dispatch_entities_bytype_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dispatch_entities_bytype_parameters.json index 7d995f39a..a75ff3653 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dispatch_entities_bytype_parameters.json +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dispatch_entities_bytype_parameters.json @@ -6,15 +6,21 @@ "paramRequired": false }, { - "paramName": "s", - "paramLongName": "sourcePath", - "paramDescription": "the source path", + "paramName": "ep", + "paramLongName": "entitiesPath", + "paramDescription": "the entities path", "paramRequired": true }, { "paramName": "g", - "paramLongName": "graphRawPath", - "paramDescription": "the path of the graph Raw in hdfs", + "paramLongName": "outputPath", + "paramDescription": "the output path to store the dispatched entities", + "paramRequired": true + }, + { + "paramName": "class", + "paramLongName": "graphTableClassName", + "paramDescription": "class name modelling the graph table", "paramRequired": true } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/groupbyid/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/groupbyid/oozie_app/workflow.xml index 3715d097d..70b2ea68e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/groupbyid/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/groupbyid/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + @@ -46,18 +46,18 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + yarn 
cluster - Fuse graph entities by ID - eu.dnetlib.dhp.oa.graph.fuse.FuseGraphResultsSparkJob + group graph entities and relations + eu.dnetlib.dhp.oa.graph.groupbyid.GroupEntitiesAndRelationsSparkJob dhp-graph-mapper-${projectVersion}.jar --executor-cores=${sparkExecutorCores} @@ -70,18 +70,29 @@ --conf spark.sql.shuffle.partitions=7680 --graphInputPath${graphInputPath} - --outputPath${workingDir}/entities + --outputPath${workingDir}/grouped_entities - + - + + + + + + + + + + + + yarn cluster - Merge datasets - eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob + Dispatch publications + eu.dnetlib.dhp.oa.graph.groupbyid.DispatchEntitiesSparkJob dhp-graph-mapper-${projectVersion}.jar --executor-cores=${sparkExecutorCores} @@ -93,22 +104,45 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=7680 - --betaInputPath${betaInputGgraphPath}/dataset - --prodInputPath${prodInputGgraphPath}/dataset - --outputPath${graphOutputPath}/dataset + --entitiesPath${workingDir}/grouped_entities + --outputPath${outputPath}/publication + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication + + + + + + + + yarn + cluster + Dispatch datasets + eu.dnetlib.dhp.oa.graph.groupbyid.DispatchEntitiesSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --entitiesPath${workingDir}/grouped_entities + --outputPath${outputPath}/dataset --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Dataset - --priority${priority} - + - + yarn cluster - Merge otherresearchproducts - eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob + Dispatch softwares + eu.dnetlib.dhp.oa.graph.groupbyid.DispatchEntitiesSparkJob dhp-graph-mapper-${projectVersion}.jar --executor-cores=${sparkExecutorCores} @@ -120,49 +154,20 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=7680 - --betaInputPath${betaInputGgraphPath}/otherresearchproduct - --prodInputPath${prodInputGgraphPath}/otherresearchproduct - --outputPath${graphOutputPath}/otherresearchproduct - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct - --priority${priority} - - - - - - - - yarn - cluster - Merge softwares - eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 - - --betaInputPath${betaInputGgraphPath}/software - --prodInputPath${prodInputGgraphPath}/software - --outputPath${graphOutputPath}/software + --entitiesPath${workingDir}/grouped_entities + --outputPath${outputPath}/software --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Software - --priority${priority} - + - + yarn cluster - Merge datasources - eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob + Dispatch 
otherresearchproducts + eu.dnetlib.dhp.oa.graph.groupbyid.DispatchEntitiesSparkJob dhp-graph-mapper-${projectVersion}.jar --executor-cores=${sparkExecutorCores} @@ -174,22 +179,45 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=7680 - --betaInputPath${betaInputGgraphPath}/datasource - --prodInputPath${prodInputGgraphPath}/datasource - --outputPath${graphOutputPath}/datasource + --entitiesPath${workingDir}/grouped_entities + --outputPath${outputPath}/otherresearchproduct + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + + + + + + + + yarn + cluster + Dispatch datasources + eu.dnetlib.dhp.oa.graph.groupbyid.DispatchEntitiesSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --entitiesPath${workingDir}/grouped_entities + --outputPath${outputPath}/datasource --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Datasource - --priority${priority} - + - + yarn cluster - Merge organizations - eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob + Dispatch organizations + eu.dnetlib.dhp.oa.graph.groupbyid.DispatchEntitiesSparkJob dhp-graph-mapper-${projectVersion}.jar --executor-cores=${sparkExecutorCores} @@ -201,22 +229,20 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=7680 - --betaInputPath${betaInputGgraphPath}/organization - --prodInputPath${prodInputGgraphPath}/organization - --outputPath${graphOutputPath}/organization + --entitiesPath${workingDir}/grouped_entities + --outputPath${outputPath}/organization --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Organization - --priority${priority} - + - + yarn cluster - Merge projects - eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob + Dispatch project + eu.dnetlib.dhp.oa.graph.groupbyid.DispatchEntitiesSparkJob dhp-graph-mapper-${projectVersion}.jar --executor-cores=${sparkExecutorCores} @@ -228,22 +254,20 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=7680 - --betaInputPath${betaInputGgraphPath}/project - --prodInputPath${prodInputGgraphPath}/project - --outputPath${graphOutputPath}/project + --entitiesPath${workingDir}/grouped_entities + --outputPath${outputPath}/project --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Project - --priority${priority} - + - + yarn cluster - Merge relations - eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob + Dispatch relations + eu.dnetlib.dhp.oa.graph.groupbyid.DispatchEntitiesSparkJob dhp-graph-mapper-${projectVersion}.jar --executor-cores=${sparkExecutorCores} @@ -255,17 +279,15 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=7680 - --betaInputPath${betaInputGgraphPath}/relation - --prodInputPath${prodInputGgraphPath}/relation - --outputPath${graphOutputPath}/relation + --entitiesPath${workingDir}/grouped_entities + --outputPath${outputPath}/relation --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Relation - --priority${priority} - + - + \ No newline at end of file diff --git 
a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml index 604f515a5..5af117d68 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml @@ -2,11 +2,11 @@ - betaInputGgraphPath + betaInputGraphPath the beta graph root path - prodInputGgraphPath + prodInputGraphPath the production graph root path diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java index f663d6095..9cf75f208 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java @@ -27,14 +27,8 @@ import org.mockito.junit.jupiter.MockitoExtension; import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; -import eu.dnetlib.dhp.schema.oaf.Datasource; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.Organization; -import eu.dnetlib.dhp.schema.oaf.Project; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.*; @ExtendWith(MockitoExtension.class) public class MigrateDbEntitiesApplicationTest { diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java index d8eba31b6..b44ed7446 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java @@ -2,12 +2,11 @@ package eu.dnetlib.dhp.oa.provision; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static eu.dnetlib.dhp.utils.DHPUtils.toSeq; -import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Optional; -import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.Text; @@ -28,13 +27,11 @@ import com.google.common.collect.Maps; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.oa.provision.model.*; +import eu.dnetlib.dhp.oa.provision.model.JoinedEntity; +import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport; import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory; -import eu.dnetlib.dhp.schema.oaf.*; import scala.Tuple2; -import scala.collection.JavaConverters; -import scala.collection.Seq; /** * XmlConverterJob converts the JoinedEntities as XML records @@ -43,8 +40,6 @@ public class XmlConverterJob { private static final Logger log = LoggerFactory.getLogger(XmlConverterJob.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static final String schemaLocation = 
"https://www.openaire.eu/schema/1.0/oaf-1.0.xsd"; public static void main(String[] args) throws Exception { @@ -129,10 +124,6 @@ public class XmlConverterJob { HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); } - private static Seq toSeq(List list) { - return JavaConverters.asScalaIteratorConverter(list.iterator()).asScala().toSeq(); - } - private static Map prepareAccumulators(SparkContext sc) { Map accumulators = Maps.newHashMap(); accumulators From 528231a287be05c251709773c62ed8386382e8a9 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 13 Nov 2020 15:37:48 +0100 Subject: [PATCH 12/17] grouping graph entities by id turned out to be an easy extension for the already existing cleaning workflow --- .../oa/graph/clean/CleanGraphSparkJob.java | 14 +- .../GroupEntitiesAndRelationsSparkJob.java | 2 +- .../groupbyid/DispatchEntitiesSparkJob.java | 97 ------ .../dhp/oa/graph/clean/oozie_app/workflow.xml | 26 +- .../groupbyid/oozie_app/config-default.xml | 18 -- .../oa/graph/groupbyid/oozie_app/workflow.xml | 293 ------------------ 6 files changed, 35 insertions(+), 415 deletions(-) rename dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/{groupbyid => clean}/GroupEntitiesAndRelationsSparkJob.java (99%) delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/groupbyid/DispatchEntitiesSparkJob.java delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/groupbyid/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/groupbyid/oozie_app/workflow.xml diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java index 714b35dac..8231dd77e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java @@ -6,7 +6,9 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.util.Optional; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -69,12 +71,12 @@ public class CleanGraphSparkJob { conf, isSparkSessionManaged, spark -> { - removeOutputDir(spark, outputPath); - fixGraphTable(spark, vocs, inputPath, entityClazz, outputPath); + HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration()); + cleanGraphTable(spark, vocs, inputPath, entityClazz, outputPath); }); } - private static void fixGraphTable( + private static void cleanGraphTable( SparkSession spark, VocabularyGroup vocs, String inputPath, @@ -100,13 +102,15 @@ public class CleanGraphSparkJob { return spark .read() .textFile(inputEntityPath) + .filter((FilterFunction) s -> isEntityType(s, clazz)) + .map((MapFunction) s -> StringUtils.substringAfter(s, "|"), Encoders.STRING()) .map( (MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); } - private static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + private static boolean isEntityType(final String s, 
final Class clazz) { + return StringUtils.substringBefore(s, "|").equals(clazz.getName()); } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/groupbyid/GroupEntitiesAndRelationsSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/GroupEntitiesAndRelationsSparkJob.java similarity index 99% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/groupbyid/GroupEntitiesAndRelationsSparkJob.java rename to dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/GroupEntitiesAndRelationsSparkJob.java index 1d887cecb..9c80528e3 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/groupbyid/GroupEntitiesAndRelationsSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/GroupEntitiesAndRelationsSparkJob.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.oa.graph.groupbyid; +package eu.dnetlib.dhp.oa.graph.clean; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.utils.DHPUtils.toSeq; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/groupbyid/DispatchEntitiesSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/groupbyid/DispatchEntitiesSparkJob.java deleted file mode 100644 index 1b4226411..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/groupbyid/DispatchEntitiesSparkJob.java +++ /dev/null @@ -1,97 +0,0 @@ - -package eu.dnetlib.dhp.oa.graph.groupbyid; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - -import java.util.Optional; - -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.FilterFunction; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication; -import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.OafEntity; -import scala.Tuple2; - -public class DispatchEntitiesSparkJob { - - private static final Logger log = LoggerFactory.getLogger(DispatchEntitiesSparkJob.class); - - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - - public static void main(final String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - MigrateMongoMdstoresApplication.class - .getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/dispatch_entities_bytype_parameters.json"))); - parser.parseArgument(args); - - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - - final String entitiesPath = parser.get("entitiesPath"); - log.info("entitiesPath: {}", entitiesPath); - - final String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); - - String graphTableClassName = parser.get("graphTableClassName"); - 
log.info("graphTableClassName: {}", graphTableClassName); - - Class entityClazz = (Class) Class.forName(graphTableClassName); - - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration()); - dispatchOaf(spark, entityClazz, entitiesPath, outputPath); - }); - } - - private static void dispatchOaf( - final SparkSession spark, - final Class clazz, - final String sourcePath, - final String targetPath) { - - log.info("Processing entities ({}) in file: {}", clazz.getName(), sourcePath); - - spark - .read() - .textFile(sourcePath) - .filter((FilterFunction) s -> isEntityType(s, clazz)) - .map((MapFunction) s -> StringUtils.substringAfter(s, "|"), Encoders.STRING()) - .write() - .option("compression", "gzip") - .mode(SaveMode.Overwrite) - .text(targetPath); - } - - private static boolean isEntityType(final String s, final Class clazz) { - return StringUtils.substringBefore(s, "|").equals(clazz.getName()); - } - -} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml index 7329df29a..8b6ca9de6 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml @@ -50,12 +50,36 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + yarn + cluster + group graph entities and relations + eu.dnetlib.dhp.oa.graph.clean.GroupEntitiesAndRelationsSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --graphInputPath${graphInputPath} + --outputPath${workingDir}/grouped_entities + + + + + diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/groupbyid/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/groupbyid/oozie_app/config-default.xml deleted file mode 100644 index 2e0ed9aee..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/groupbyid/oozie_app/config-default.xml +++ /dev/null @@ -1,18 +0,0 @@ - - - jobTracker - yarnRM - - - nameNode - hdfs://nameservice1 - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/groupbyid/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/groupbyid/oozie_app/workflow.xml deleted file mode 100644 index 70b2ea68e..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/groupbyid/oozie_app/workflow.xml +++ /dev/null @@ -1,293 +0,0 @@ - - - - - graphInputPath - the graph root input path - - - 
outputPath - the graph root output path - - - - sparkDriverMemory - memory for driver process - - - sparkExecutorMemory - memory for individual executor - - - sparkExecutorCores - number of cores used by single executor - - - oozieActionShareLibForSpark2 - oozie action sharelib for spark 2.* - - - spark2ExtraListeners - com.cloudera.spark.lineage.NavigatorAppListener - spark 2.* extra listeners classname - - - spark2SqlQueryExecutionListeners - com.cloudera.spark.lineage.NavigatorQueryListener - spark 2.* sql query execution listeners classname - - - spark2YarnHistoryServerAddress - spark 2.* yarn history server address - - - spark2EventLogDir - spark 2.* event log dir location - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - yarn - cluster - group graph entities and relations - eu.dnetlib.dhp.oa.graph.groupbyid.GroupEntitiesAndRelationsSparkJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 - - --graphInputPath${graphInputPath} - --outputPath${workingDir}/grouped_entities - - - - - - - - - - - - - - - - - - - yarn - cluster - Dispatch publications - eu.dnetlib.dhp.oa.graph.groupbyid.DispatchEntitiesSparkJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 - - --entitiesPath${workingDir}/grouped_entities - --outputPath${outputPath}/publication - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication - - - - - - - - yarn - cluster - Dispatch datasets - eu.dnetlib.dhp.oa.graph.groupbyid.DispatchEntitiesSparkJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 - - --entitiesPath${workingDir}/grouped_entities - --outputPath${outputPath}/dataset - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Dataset - - - - - - - - yarn - cluster - Dispatch softwares - eu.dnetlib.dhp.oa.graph.groupbyid.DispatchEntitiesSparkJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 - 
- --entitiesPath${workingDir}/grouped_entities - --outputPath${outputPath}/software - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Software - - - - - - - - yarn - cluster - Dispatch otherresearchproducts - eu.dnetlib.dhp.oa.graph.groupbyid.DispatchEntitiesSparkJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 - - --entitiesPath${workingDir}/grouped_entities - --outputPath${outputPath}/otherresearchproduct - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct - - - - - - - - yarn - cluster - Dispatch datasources - eu.dnetlib.dhp.oa.graph.groupbyid.DispatchEntitiesSparkJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 - - --entitiesPath${workingDir}/grouped_entities - --outputPath${outputPath}/datasource - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Datasource - - - - - - - - yarn - cluster - Dispatch organizations - eu.dnetlib.dhp.oa.graph.groupbyid.DispatchEntitiesSparkJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 - - --entitiesPath${workingDir}/grouped_entities - --outputPath${outputPath}/organization - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Organization - - - - - - - - yarn - cluster - Dispatch project - eu.dnetlib.dhp.oa.graph.groupbyid.DispatchEntitiesSparkJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 - - --entitiesPath${workingDir}/grouped_entities - --outputPath${outputPath}/project - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Project - - - - - - - - yarn - cluster - Dispatch relations - eu.dnetlib.dhp.oa.graph.groupbyid.DispatchEntitiesSparkJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf 
spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 - - --entitiesPath${workingDir}/grouped_entities - --outputPath${outputPath}/relation - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Relation - - - - - - - - - \ No newline at end of file From 2facfefc190b47496fbc2806c3ff9cacf365d3af Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 13 Nov 2020 15:38:40 +0100 Subject: [PATCH 13/17] updated maven repository URL --- dhp-build/dhp-code-style/pom.xml | 4 ++-- pom.xml | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dhp-build/dhp-code-style/pom.xml b/dhp-build/dhp-code-style/pom.xml index e60e8076e..77aa2aedb 100644 --- a/dhp-build/dhp-code-style/pom.xml +++ b/dhp-build/dhp-code-style/pom.xml @@ -15,12 +15,12 @@ dnet45-snapshots DNet45 Snapshots - http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-snapshots + https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots default dnet45-releases - http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases + https://maven.d4science.org/nexus/content/repositories/dnet45-releases diff --git a/pom.xml b/pom.xml index 03c69108d..e9b90a765 100644 --- a/pom.xml +++ b/pom.xml @@ -50,7 +50,7 @@ dnet45-releases D-Net 45 releases - http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases + https://maven.d4science.org/nexus/content/repositories/dnet45-releases default false @@ -639,12 +639,12 @@ dnet45-snapshots DNet45 Snapshots - http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-snapshots + https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots default dnet45-releases - http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases + https://maven.d4science.org/nexus/content/repositories/dnet45-releases From 5d4e34e26afc5346828a01ae8b5648f2fd908aef Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Sat, 14 Nov 2020 10:32:26 +0100 Subject: [PATCH 14/17] fixed typo in variable name --- .../dhp/oa/graph/merge/oozie_app/workflow.xml | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml index 5af117d68..86fb51042 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml @@ -88,8 +88,8 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=7680 - --betaInputPath${betaInputGgraphPath}/publication - --prodInputPath${prodInputGgraphPath}/publication + --betaInputPath${betaInputGraphPath}/publication + --prodInputPath${prodInputGraphPath}/publication --outputPath${graphOutputPath}/publication --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication --priority${priority} @@ -115,8 +115,8 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=7680 - --betaInputPath${betaInputGgraphPath}/dataset - --prodInputPath${prodInputGgraphPath}/dataset + --betaInputPath${betaInputGraphPath}/dataset + --prodInputPath${prodInputGraphPath}/dataset --outputPath${graphOutputPath}/dataset 
--graphTableClassNameeu.dnetlib.dhp.schema.oaf.Dataset --priority${priority} @@ -142,8 +142,8 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=7680 - --betaInputPath${betaInputGgraphPath}/otherresearchproduct - --prodInputPath${prodInputGgraphPath}/otherresearchproduct + --betaInputPath${betaInputGraphPath}/otherresearchproduct + --prodInputPath${prodInputGraphPath}/otherresearchproduct --outputPath${graphOutputPath}/otherresearchproduct --graphTableClassNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct --priority${priority} @@ -169,8 +169,8 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=7680 - --betaInputPath${betaInputGgraphPath}/software - --prodInputPath${prodInputGgraphPath}/software + --betaInputPath${betaInputGraphPath}/software + --prodInputPath${prodInputGraphPath}/software --outputPath${graphOutputPath}/software --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Software --priority${priority} @@ -196,8 +196,8 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=7680 - --betaInputPath${betaInputGgraphPath}/datasource - --prodInputPath${prodInputGgraphPath}/datasource + --betaInputPath${betaInputGraphPath}/datasource + --prodInputPath${prodInputGraphPath}/datasource --outputPath${graphOutputPath}/datasource --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Datasource --priority${priority} @@ -223,8 +223,8 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=7680 - --betaInputPath${betaInputGgraphPath}/organization - --prodInputPath${prodInputGgraphPath}/organization + --betaInputPath${betaInputGraphPath}/organization + --prodInputPath${prodInputGraphPath}/organization --outputPath${graphOutputPath}/organization --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Organization --priority${priority} @@ -250,8 +250,8 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=7680 - --betaInputPath${betaInputGgraphPath}/project - --prodInputPath${prodInputGgraphPath}/project + --betaInputPath${betaInputGraphPath}/project + --prodInputPath${prodInputGraphPath}/project --outputPath${graphOutputPath}/project --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Project --priority${priority} @@ -277,8 +277,8 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=7680 - --betaInputPath${betaInputGgraphPath}/relation - --prodInputPath${prodInputGgraphPath}/relation + --betaInputPath${betaInputGraphPath}/relation + --prodInputPath${prodInputGraphPath}/relation --outputPath${graphOutputPath}/relation --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Relation --priority${priority} From 331d62180060caee96f38431a65cae0960be7eca Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Sat, 14 Nov 2020 12:16:15 +0100 Subject: [PATCH 15/17] added test resource --- .../eu/dnetlib/dhp/oa/graph/raw/oaf_orp.xml | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_orp.xml diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_orp.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_orp.xml new file mode 100644 index 000000000..6c83073de --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_orp.xml @@ -0,0 +1,84 @@ + + +

+ pensoft_____::00ea4a1cd53806a97d62ea6bf268f2a2 + 10.3897/oneeco.2.e13718 + + + + + + 2020-03-23T00:20:51.392Z + 2020-03-23T00:26:59.078Z + pensoft_____ +
+ + Ecosystem Service capacity is higher in areas of multiple designation types + Nikolaidou,Charitini + Votsi,Nefta + Sgardelis,Steanos + Halley,John + Pantis,John + Tsiafouli,Maria + 2017 + The implementation of the Ecosystem Service (ES) concept into practice might be a challenging task as it has to take into account previous “traditional” policies and approaches that have evaluated nature and biodiversity differently. Among them the Habitat (92/43/EC) and Bird Directives (79/409/EC), the Water Framework Directive (2000/60/EC), and the Noise Directive (2002/49/EC) have led to the evaluation/designation of areas in Europe with different criteria. In this study our goal was to understand how the ES capacity of an area is related to its designation and if areas with multiple designations have higher capacity in providing ES. We selected four catchments in Greece with a great variety of characteristics covering over 25% of the national territory. Inside the catchments we assessed the ES capacity (following the methodology of Burkhard et al. 2009) of areas designated as Natura 2000 sites, Quiet areas and Wetlands or Water bodies and found those areas that have multiple designations. Data were analyzed by GLM to reveal differences regarding the ES capacity among the different types of areas. We also investigated by PCA synergies and trade-offs among different kinds of ES and tested for correlations among landscape properties, such as elevation, aspect and slope and the ES potential. Our results show that areas with different types or multiple designations have a different capacity in providing ES. Areas of one designation type (Protected or Quiet Areas) had in general intermediate scores in most ES but scores were higher compared to areas with no designation, which displayed stronger capacity in provisioning services. Among Protected Areas and Quiet Areas the latter scored better in general. Areas that combined both designation types (Protected and Quiet Areas) showed the highest capacity in 13 out of 29 ES, that were mostly linked with natural and forest ecosystems. We found significant synergies among most regulating, supporting and cultural ES which in turn display trade-offs with provisioning services. The different ES are spatially related and display strong correlation with landscape properties, such as elevation and slope. We suggest that the designation status of an area can be used as an alternative tool for environmental policy, indicating the capacity for ES provision. Multiple designations of areas can be used as proxies for locating ES “hotspots”. This integration of “traditional” evaluation and designation and the “newer” ES concept forms a time- and cost-effective way to be adopted by stakeholders and policy-makers in order to start complying with new standards and demands for nature conservation and environmental management. 
+ text/html + https://doi.org/10.3897/oneeco.2.e13718 + https://oneecosystem.pensoft.net/article/13718/ + eng + Pensoft Publishers + info:eu-repo/semantics/altIdentifier/eissn/2367-8194 + info:eu-repo/grantAgreement/EC/FP7/226852 + One Ecosystem 2: e13718 + One Ecosystem 2: e13718 + One Ecosystem 2: e13718 + Ecosystem Services hotspots + Natura 2000 + Quiet Protected Areas + Biodiversity + Agriculture + Elevation + Slope + Ecosystem Service trade-offs and synergies + cultural services + provisioning services + regulating services + supporting services + Research Article + + 0020 + 2017-01-01 + corda_______::226852 + OPEN + + + 10.3897/oneeco.2.e13718 + https://oneecosystem.pensoft.net/article/13718/ + One Ecosystem + 0001 + + + + + http%3A%2F%2Fzookeys.pensoft.net%2Foai.php + 10.3897/oneeco.2.e13718 + 2017-09-08 + http://www.openarchives.org/OAI/2.0/oai_dc/ + + + + false + false + 0.9 + + + + + \ No newline at end of file From 4de8c8b237dc8a93a9d4ed3fe7b8745bb4f567d3 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 16 Nov 2020 10:03:11 +0100 Subject: [PATCH 16/17] fixed workflow variable name --- .../dhp/oa/graph/clean/oozie_app/workflow.xml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml index 8b6ca9de6..992d8c40e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml @@ -108,7 +108,7 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=7680 - --inputPath${graphInputPath}/publication + --inputPath${workingDir}/grouped_entities --outputPath${graphOutputPath}/publication --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication --isLookupUrl${isLookupUrl} @@ -134,7 +134,7 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=7680 - --inputPath${graphInputPath}/dataset + --inputPath${workingDir}/grouped_entities --outputPath${graphOutputPath}/dataset --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Dataset --isLookupUrl${isLookupUrl} @@ -160,7 +160,7 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=7680 - --inputPath${graphInputPath}/otherresearchproduct + --inputPath${workingDir}/grouped_entities --outputPath${graphOutputPath}/otherresearchproduct --graphTableClassNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct --isLookupUrl${isLookupUrl} @@ -186,7 +186,7 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=7680 - --inputPath${graphInputPath}/software + --inputPath${workingDir}/grouped_entities --outputPath${graphOutputPath}/software --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Software --isLookupUrl${isLookupUrl} @@ -212,7 +212,7 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=7680 - --inputPath${graphInputPath}/datasource + --inputPath${workingDir}/grouped_entities --outputPath${graphOutputPath}/datasource --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Datasource --isLookupUrl${isLookupUrl} @@ -238,7 +238,7 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=7680 - 
--inputPath${graphInputPath}/organization + --inputPath${workingDir}/grouped_entities --outputPath${graphOutputPath}/organization --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Organization --isLookupUrl${isLookupUrl} @@ -264,7 +264,7 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=7680 - --inputPath${graphInputPath}/project + --inputPath${workingDir}/grouped_entities --outputPath${graphOutputPath}/project --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Project --isLookupUrl${isLookupUrl} @@ -290,7 +290,7 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=7680 - --inputPath${graphInputPath}/relation + --inputPath${workingDir}/grouped_entities --outputPath${graphOutputPath}/relation --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Relation --isLookupUrl${isLookupUrl} From 6ab1ce53c9e96fe719097588b003ad55609c3b8b Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 16 Nov 2020 10:09:17 +0100 Subject: [PATCH 17/17] fixed condition in result pid cleaning; cleanup --- .../dhp/oa/graph/clean/CleaningFunctions.java | 2 +- .../dispatch_entities_bytype_parameters.json | 26 ------------------- 2 files changed, 1 insertion(+), 27 deletions(-) delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dispatch_entities_bytype_parameters.json diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java index 9f06ea056..4bcce8037 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java @@ -114,7 +114,7 @@ public class CleaningFunctions { .stream() .filter(Objects::nonNull) .filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue()))) - .filter(sp -> NONE.equalsIgnoreCase(sp.getValue())) + .filter(sp -> !NONE.equalsIgnoreCase(sp.getValue().trim())) .filter(sp -> Objects.nonNull(sp.getQualifier())) .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) .map(sp -> { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dispatch_entities_bytype_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dispatch_entities_bytype_parameters.json deleted file mode 100644 index a75ff3653..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dispatch_entities_bytype_parameters.json +++ /dev/null @@ -1,26 +0,0 @@ -[ - { - "paramName": "issm", - "paramLongName": "isSparkSessionManaged", - "paramDescription": "when true will stop SparkSession after job execution", - "paramRequired": false - }, - { - "paramName": "ep", - "paramLongName": "entitiesPath", - "paramDescription": "the entities path", - "paramRequired": true - }, - { - "paramName": "g", - "paramLongName": "outputPath", - "paramDescription": "the output path to store the dispatched entities", - "paramRequired": true - }, - { - "paramName": "class", - "paramLongName": "graphTableClassName", - "paramDescription": "class name modelling the graph table", - "paramRequired": true - } -] \ No newline at end of file