From 6a288625e58eb7252ed7dbfd16aa0ae709168438 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Mon, 17 Feb 2020 15:04:33 +0100
Subject: [PATCH 01/11] fixed workflow outgoing node

---
 .../resources/eu/dnetlib/dhp/migration/oozie_app/workflow.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/workflow.xml
index 309a6d90f0..dd6998db08 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/workflow.xml
@@ -113,7 +113,7 @@
             -pguser${postgresUser}
             -pgpasswd${postgresPassword}
-
+

From 0f364605ffa63941e622225eca3050a5b2e287e0 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Tue, 18 Feb 2020 11:48:19 +0100
Subject: [PATCH 02/11] removed stale tests, need to reimplement them anyway

---
 .../dnetlib/dhp/graph/MappingUtilsTest.java   | 66 -------------------
 .../dhp/graph/XmlRecordFactoryTest.java       | 55 ----------------
 2 files changed, 121 deletions(-)
 delete mode 100644 dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/MappingUtilsTest.java
 delete mode 100644 dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/XmlRecordFactoryTest.java

diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/MappingUtilsTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/MappingUtilsTest.java
deleted file mode 100644
index a9d696beaa..0000000000
--- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/MappingUtilsTest.java
+++ /dev/null
@@ -1,66 +0,0 @@
-package eu.dnetlib.dhp.graph;
-
-import eu.dnetlib.dhp.graph.model.EntityRelEntity;
-import eu.dnetlib.dhp.graph.model.RelatedEntity;
-import eu.dnetlib.dhp.graph.utils.GraphMappingUtils;
-import org.codehaus.jackson.map.ObjectMapper;
-import org.junit.Before;
-import org.junit.Test;
-
-import java.io.IOException;
-import java.io.InputStreamReader;
-
-public class MappingUtilsTest {
-
-    private GraphMappingUtils utils;
-
-    @Before
-    public void setUp() {
-        utils = new GraphMappingUtils();
-    }
-
-    @Test
-    public void testOafMappingDatasource() throws IOException {
-
-        final InputStreamReader in = new InputStreamReader(getClass().getResourceAsStream("datasource.json"));
-        final EntityRelEntity e = new ObjectMapper().readValue(in, EntityRelEntity.class);
-        e.getSource().setType("datasource");
-
-        final EntityRelEntity out = utils.asRelatedEntity(e);
-        System.out.println(out);
-
-    }
-
-    //@Test
-    public void testOafMappingResult() throws IOException {
-
-        final InputStreamReader in = new InputStreamReader(getClass().getResourceAsStream("result.json"));
-        final EntityRelEntity e = new ObjectMapper().readValue(in, EntityRelEntity.class);
-
-        final EntityRelEntity out = utils.asRelatedEntity(e);
-        System.out.println(out);
-
-    }
-
-    @Test
-    public void testOafMappingSoftware() throws IOException {
-
-        final InputStreamReader in = new InputStreamReader(getClass().getResourceAsStream("software.json"));
-        final EntityRelEntity e = new ObjectMapper().readValue(in, EntityRelEntity.class);
-
-        final EntityRelEntity out = utils.asRelatedEntity(e);
-        System.out.println(out);
-
-    }
-
-
-    @Test
-    public void testParseRelatedEntity() throws IOException {
-
-        final InputStreamReader in = new InputStreamReader(getClass().getResourceAsStream("related_entity.json"));
-        final RelatedEntity e = new ObjectMapper().readValue(in, RelatedEntity.class);
-
-        System.out.println(e);
-
-    }
-}
diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/XmlRecordFactoryTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/XmlRecordFactoryTest.java
deleted file mode 100644
index 2a3c343ec6..0000000000
--- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/XmlRecordFactoryTest.java
+++ /dev/null
@@ -1,55 +0,0 @@
-package eu.dnetlib.dhp.graph;
-
-import eu.dnetlib.dhp.graph.utils.ContextMapper;
-import org.apache.commons.io.FileUtils;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.spark.sql.SparkSession;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Test;
-
-import java.io.File;
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-
-public class XmlRecordFactoryTest {
-
-    private static final Log log = LogFactory.getLog(XmlRecordFactoryTest.class);
-
-    private Path testDir;
-
-    @Before
-    public void setup() throws IOException {
-        testDir = Files.createTempDirectory(getClass().getSimpleName());
-        log.info("created test directory " + testDir.toString());
-    }
-
-    @After
-    public void tearDown() throws IOException {
-        FileUtils.deleteDirectory(testDir.toFile());
-        log.info("deleted test directory " + testDir.toString());
-    }
-
-    @Test
-    public void testXmlSerialization() throws Exception {
-
-        final SparkSession spark = SparkSession
-                .builder()
-                .appName(SparkXmlRecordBuilderJob.class.getSimpleName())
-                .master("local[*]")
-                .getOrCreate();
-
-        final String inputDir = testDir.toString() + "/3_joined_entities";
-        FileUtils.forceMkdir(new File(inputDir));
-        FileUtils.copyFile(new File("/Users/claudio/Downloads/joined_entities-part-00000"), new File(inputDir + "/joined_entities-part-00000"));
-
-        final ContextMapper ctx = ContextMapper.fromIS("https://dev-openaire.d4science.org:443/is/services/isLookUp");
-
-        final GraphJoiner g = new GraphJoiner(spark, ctx, inputDir, testDir.toString());
-
-        g.asXML();
-    }
-
-}

From ed76521d9b0d5c761262997aa064d980bda46dd5 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Tue, 18 Feb 2020 11:50:39 +0100
Subject: [PATCH 03/11] removed stale test resources, will be re-added later on

---
 .../src/test/resources/eu/dnetlib/dhp/graph/datasource.json      | 1 -
 .../test/resources/eu/dnetlib/dhp/graph/related_entity.json      | 5 -----
 .../src/test/resources/eu/dnetlib/dhp/graph/result.json          | 1 -
 .../src/test/resources/eu/dnetlib/dhp/graph/software.json        | 1 -
 4 files changed, 8 deletions(-)
 delete mode 100644 dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/datasource.json
 delete mode 100644 dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/related_entity.json
 delete mode 100644 dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/result.json
 delete mode 100644 dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/software.json

diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/datasource.json b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/datasource.json
deleted file mode 100644
index c26154c1ef..0000000000
--- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/datasource.json
+++ /dev/null
@@ -1 +0,0 @@
-{"source":{"sourceId":"10|CRIS_UNS____::f66f1bd369679b5b077dcdf006089556","targetId":null,"deleted":false,"oaf":"{\"datasourcetype\":{\"classid\":\"crissystem\",\"classname\":\"CRIS System\",\"schemeid\":\"dnet:datasource_typologies\",\"schemename\":\"dnet:datasource_typologies\"},\"openairecompatibility\":{\"classid\":\"openaire-cris_1.1\",\"classname\":\"OpenAIRE CRIS v1.1\",\"schemeid\":\"dnet:datasourceCompatibilityLevel\",\"schemename\":\"dnet:datasourceCompatibilityLevel\"},\"officialname\":{\"value\":\"CRIS UNS (Current Research Information System University of Novi Sad)\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"englishname\":{\"value\":\"CRIS UNS (Current Research Information System University of Novi Sad)\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"websiteurl\":{\"value\":\"https://cris.uns.ac.rs/\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"logourl\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"contactemail\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"namespaceprefix\":{\"value\":\"CrisUnsNoviS\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"latitude\":{\"value\":\"0.0\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"longitude\":{\"value\":\"0.0\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"dateofvalidation\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"description\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"subjects\":[],\"odnumberofitems\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"odnumberofitemsdate\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\
",\"schemeid\":\"\",\"schemename\":\"\"}}},\"odpolicies\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"odlanguages\":[],\"odcontenttypes\":[],\"accessinfopackage\":[{\"value\":\"https://cris.uns.ac.rs/OAIHandlerOpenAIRECRIS\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}}],\"releasestartdate\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"releaseenddate\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"missionstatementurl\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"dataprovider\":{\"value\":false,\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"serviceprovider\":{\"value\":false,\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"databaseaccesstype\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"datauploadtype\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"databaseaccessrestriction\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"datauploadrestriction\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"versioning\":{\"value\":false,\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"citationguidelineurl\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"qualitymanagementkind\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\
"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"pidsystems\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"certificates\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"policies\":[],\"journal\":{\"name\":\"\",\"issnPrinted\":\"\",\"issnOnline\":\"\",\"issnLinking\":\"\",\"ep\":\"\",\"iss\":\"\",\"sp\":\"\",\"vol\":\"\",\"edition\":\"\",\"conferenceplace\":\"\",\"conferencedate\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"id\":\"10|CRIS_UNS____::f66f1bd369679b5b077dcdf006089556\",\"originalId\":[\"CRIS_UNS____::openaire\"],\"collectedfrom\":[{\"key\":\"\",\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}}],\"pid\":[],\"extraInfo\":[],\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:entityregistry\",\"classname\":\"sysimport:crosswalk:entityregistry\",\"schemeid\":\"dnet:provenance_actions\",\"schemename\":\"dnet:provenance_actions\"}},\"lastupdatetimestamp\":0}"},"relation":null,"target":null} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/related_entity.json b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/related_entity.json deleted file mode 100644 index 25c92baa3d..0000000000 --- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/related_entity.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "id": "20|nih_________::6b8108b6d6399f7163a6a7ccdd0efc2d", - "type": "organization", - "legalname": "MCGILL UNIVERSITY" -} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/result.json b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/result.json deleted file mode 100644 index 5d6c3f29bf..0000000000 --- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/result.json +++ /dev/null @@ -1 +0,0 @@ -{"source":{"sourceId":"50|od_______165::5642f79c597bac8029fde12a80f75412","targetId":null,"deleted":true,"type":"otherresearchproduct","oaf":"{\"contactperson\":[],\"contactgroup\":[],\"tool\":[],\"author\":[{\"fullname\":\"Cartier, Adrien\",\"name\":\"Adrien\",\"surname\":\"Cartier\",\"rank\":1,\"pid\":[],\"affiliation\":[]},{\"fullname\":\"Larroudé, Philippe\",\"name\":\"Philippe\",\"surname\":\"Larroudé\",\"rank\":2,\"pid\":[],\"affiliation\":[]},{\"fullname\":\"Héquette, 
Arnaud\",\"name\":\"Arnaud\",\"surname\":\"Héquette\",\"rank\":3,\"pid\":[],\"affiliation\":[]}],\"resulttype\":{\"classid\":\"other\",\"classname\":\"other\",\"schemeid\":\"dnet:result_typologies\",\"schemename\":\"dnet:result_typologies\"},\"language\":{\"classid\":\"eng\",\"classname\":\"English\",\"schemeid\":\"dnet:languages\",\"schemename\":\"dnet:languages\"},\"country\":[],\"subject\":[{\"value\":\"[SDU.STU.OC] Sciences of the Universe/Earth Sciences/Oceanography\",\"qualifier\":{\"classid\":\"keyword\",\"classname\":\"keyword\",\"schemeid\":\"dnet:subject_classification_typologies\",\"schemename\":\"dnet:subject_classification_typologies\"},\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}}],\"title\":[{\"value\":\"Longshore Sediment Transport Measurements on Sandy Macrotidal Beaches Compared with Sediment Transport Formulae\",\"qualifier\":{\"classid\":\"main title\",\"classname\":\"main title\",\"schemeid\":\"dnet:dataCite_title\",\"schemename\":\"dnet:dataCite_title\"},\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}}],\"relevantdate\":[],\"description\":[{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}}],\"dateofacceptance\":{\"value\":\"2013-03-13\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"publisher\":{\"value\":\"intech\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"embargoenddate\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"source\":[{\"value\":\"Sediment Transport Processes and Their Modelling Applications\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},{\"value\":\"https://hal.archives-ouvertes.fr/hal-00824453\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},{\"value\":\"Sediment Transport Processes and Their Modelling Applications, intech, chapitre 2, 2013, 978-953-51-1039-2,. 
\\u0026lt;10.5772/51023\\u0026gt;\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}}],\"fulltext\":[],\"format\":[],\"contributor\":[{\"value\":\"Equipe Morphodynamique des littoraux (Dunkerque) ; Laboratoire d\\u0027Océanologie et de Géosciences (LOG) ; Université du Littoral Côte d\\u0027Opale - Université Lille I - Sciences et technologies - CNRS - Université du Littoral Côte d\\u0027Opale - Université Lille I - Sciences et technologies - CNRS\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},{\"value\":\"Laboratoire des écoulements géophysiques et industriels (LEGI) ; Université Joseph Fourier - Grenoble I - Institut polytechnique de Grenoble (Grenoble INP) - CNRS\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},{\"value\":\"Laboratoire d\\u0027Océanologie et de Géosciences (LOG) ; Université du Littoral Côte d\\u0027Opale - Université Lille I - Sciences et technologies - CNRS\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}}],\"resourcetype\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"},\"coverage\":[],\"refereed\":{\"value\":\"\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}},\"context\":[],\"id\":\"50|od_______165::5642f79c597bac8029fde12a80f75412\",\"originalId\":[\"oai:HAL:hal-00824453v1\"],\"collectedfrom\":[{\"key\":\"10|opendoar____::9766527f2b5d3e95d4a733fcfb77bd7e\",\"value\":\"INRIA a CCSD electronic archive server\",\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}}],\"pid\":[{\"value\":\"10.5772/51023\",\"qualifier\":{\"classid\":\"doi\",\"classname\":\"doi\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"dataInfo\":{\"invisible\":false,\"inferred\":false,\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"provenanceaction\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"}}}],\"extraInfo\":[],\"dataInfo\":{\"invisible\":false,\"inferred\":true,\"deletedbyinference\":true,\"inferenceprovenance\":\"dedup-similarity-result-levenstein\",\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"sysimport:crosswalk:repository\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"}},\"lastupdatetimestamp\":0}"},"relation":null,"target":null} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/software.json b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/software.json deleted file mode 100644 index 0065b6799d..0000000000 --- 
a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/graph/software.json +++ /dev/null @@ -1 +0,0 @@ -{"type":"software","entity":{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0,"id":"50|od______2659::05817f64c43a918a07483340b5726f77","originalId":["oai:zenodo.org:204139"],"collectedfrom":[{"key":"10|opendoar____::358aee4cc897452c00244351e4d91f69","value":"ZENODO","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"pid":[],"extraInfo":[],"author":[],"resulttype":{"classid":"software","classname":"software","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"language":{"classid":"und","classname":"Undetermined","schemeid":"dnet:languages","schemename":"dnet:languages"},"country":[],"subject":[],"title":[],"relevantdate":[],"description":[],"dateofacceptance":{"value":"2016-01-01","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"refereed":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"context":[],"instance":[{"license":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"accessright":{"classid":"OPEN","classname":"Open 
Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"instancetype":{"classid":"0029","classname":"Software","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"hostedby":{"key":"10|opendoar____::358aee4cc897452c00244351e4d91f69","value":"ZENODO","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"url":[],"distributionlocation":"","collectedfrom":{"key":"10|opendoar____::358aee4cc897452c00244351e4d91f69","value":"ZENODO","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"dateofacceptance":{"value":"2016-01-01","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}}],"documentationUrl":[],"license":[],"codeRepositoryUrl":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"programmingLanguage":{"classid":"","classname":"","schemeid":"","schemename":""}},"links":[{"relation":{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od______2659::05817f64c43a918a07483340b5726f77","target":"40|corda__h2020::e2a38892773e6541ec7c07aa605ad581"},"relatedEntity":{"id":"40|corda__h2020::e2a38892773e6541ec7c07aa605ad581","type":"project","projectTitle":"Engaging the EGI Community towards an Open Science Commons","code":"654142","acronym":"EGI-Engage","contracttype":{},"fundingtree":["{value=ec__________::ECECEuropean CommissionEUec__________::EC::H2020::RIAResearch and Innovation actionRIAec:h2020toasec__________::EC::H2020H2020Horizon 2020 Framework Programmeec:h2020fundings, dataInfo={invisible=false, inferred=false, deletedbyinference=false, trust=, inferenceprovenance=, provenanceaction={classid=, classname=, schemeid=, schemename=}}}"]}},{"relation":{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od______2659::05817f64c43a918a07483340b5726f77","target":"40|corda_______::4d31ccb13726266f9098129756e03f43"},"relatedEntity":{"id":"40|corda_______::4d31ccb13726266f9098129756e03f43","type":"project","projectTitle":"Common Operations of Environmental Research Infrastructures","code":"283465","acronym":"ENVRI","contracttype":{},"fundingtree":["{value=ec__________::ECECEuropean CommissionEUec__________::EC::FP7::SP4::INFRAResearch InfrastructuresINFRAec:programec__________::EC::FP7::SP4SP4-CapacitiesSP4ec:specificprogramec__________::EC::FP7SEVENTH FRAMEWORK PROGRAMMEFP7ec:frameworkprogram, 
dataInfo={invisible=false, inferred=false, deletedbyinference=false, trust=, inferenceprovenance=, provenanceaction={classid=, classname=, schemeid=, schemename=}}}"]}},{"relation":{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od______2659::05817f64c43a918a07483340b5726f77","target":"40|corda_______::5af7655a8e0e871cf16072b4b6ab9b41"},"relatedEntity":{"id":"40|corda_______::5af7655a8e0e871cf16072b4b6ab9b41","type":"project","projectTitle":"Data e-Infrastructure Initiative for Fisheries Management and Conservation of Marine Living Resources","code":"283644","acronym":"IMARINE","contracttype":{},"fundingtree":["{value=ec__________::ECECEuropean CommissionEUec__________::EC::FP7::SP4::INFRAResearch InfrastructuresINFRAec:programec__________::EC::FP7::SP4SP4-CapacitiesSP4ec:specificprogramec__________::EC::FP7SEVENTH FRAMEWORK PROGRAMMEFP7ec:frameworkprogram, dataInfo={invisible=false, inferred=false, deletedbyinference=false, trust=, inferenceprovenance=, provenanceaction={classid=, classname=, schemeid=, schemename=}}}"]}},{"relation":{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od______2659::05817f64c43a918a07483340b5726f77","target":"40|corda_______::e8da2e3e130ad3b1a650487d9ff126e4"},"relatedEntity":{"id":"40|corda_______::e8da2e3e130ad3b1a650487d9ff126e4","type":"project","projectTitle":"EU-Brazil Open Data and Cloud Computing e-Infrastructure for Biodiversity","code":"288754","acronym":"EUBRAZILOPENBIO","contracttype":{},"fundingtree":["{value=ec__________::ECECEuropean CommissionEUec__________::EC::FP7::SP1::ICTInformation and Communication TechnologiesICTec:programec__________::EC::FP7::SP1SP1-CooperationSP1ec:specificprogramec__________::EC::FP7SEVENTH FRAMEWORK PROGRAMMEFP7ec:frameworkprogram, dataInfo={invisible=false, inferred=false, deletedbyinference=false, trust=, inferenceprovenance=, provenanceaction={classid=, classname=, schemeid=, schemename=}}}"]}},{"relation":{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od______2659::05817f64c43a918a07483340b5726f77","target":"40|corda_______::15463ed3cba51f042181197cfabb2ff5"},"relatedEntity":{"id":"40|corda_______::15463ed3cba51f042181197cfabb2ff5","type":"project","projectTitle":"Data Infrastructure Ecosystem for Science","code":"239019","acronym":"D4SCIENCE-II","contracttype":{},"fundingtree":["{value=ec__________::ECECEuropean CommissionEUec__________::EC::FP7::SP4::INFRAResearch 
InfrastructuresINFRAec:programec__________::EC::FP7::SP4SP4-CapacitiesSP4ec:specificprogramec__________::EC::FP7SEVENTH FRAMEWORK PROGRAMMEFP7ec:frameworkprogram, dataInfo={invisible=false, inferred=false, deletedbyinference=false, trust=, inferenceprovenance=, provenanceaction={classid=, classname=, schemeid=, schemename=}}}"]}},{"relation":{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od______2659::05817f64c43a918a07483340b5726f77","target":"40|corda__h2020::4d46893df18bb77f5d817b8ce98ac56c"},"relatedEntity":{"id":"40|corda__h2020::4d46893df18bb77f5d817b8ce98ac56c","type":"project","projectTitle":"Pooling Activities, Resources and Tools for Heritage E-research Networking, Optimization and Synergies","code":"654119","acronym":"PARTHENOS","contracttype":{},"fundingtree":["{value=ec__________::ECECEuropean CommissionEUec__________::EC::H2020::RIAResearch and Innovation actionRIAec:h2020toasec__________::EC::H2020H2020Horizon 2020 Framework Programmeec:h2020fundings, dataInfo={invisible=false, inferred=false, deletedbyinference=false, trust=, inferenceprovenance=, provenanceaction={classid=, classname=, schemeid=, schemename=}}}"]}},{"relation":{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od______2659::05817f64c43a918a07483340b5726f77","target":"40|corda_______::7f18b83690e3a18134b9a3db66d882d3"},"relatedEntity":{"id":"40|corda_______::7f18b83690e3a18134b9a3db66d882d3","type":"project","projectTitle":"DIstributed colLaboratories Infrastructure on Grid ENabled Technology 4 Science","code":"212488","acronym":"D4SCIENCE","contracttype":{},"fundingtree":["{value=ec__________::ECECEuropean CommissionEUec__________::EC::FP7::SP4::INFRAResearch InfrastructuresINFRAec:programec__________::EC::FP7::SP4SP4-CapacitiesSP4ec:specificprogramec__________::EC::FP7SEVENTH FRAMEWORK PROGRAMMEFP7ec:frameworkprogram, dataInfo={invisible=false, inferred=false, deletedbyinference=false, trust=, inferenceprovenance=, provenanceaction={classid=, classname=, schemeid=, schemename=}}}"]}},{"relation":{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od______2659::05817f64c43a918a07483340b5726f77","target":"40|corda__h2020::6729c0ee95de7724deb60454bb4179de"},"relatedEntity":{"id":"40|corda__h2020::6729c0ee95de7724deb60454bb4179de","type":"project","projectTitle":"Building Research environments for fostering Innovation, Decision making, Governance and Education to support Blue 
growth","code":"675680","acronym":"BlueBRIDGE","contracttype":{},"fundingtree":["{value=ec__________::ECECEuropean CommissionEUec__________::EC::H2020::RIAResearch and Innovation actionRIAec:h2020toasec__________::EC::H2020H2020Horizon 2020 Framework Programmeec:h2020fundings, dataInfo={invisible=false, inferred=false, deletedbyinference=false, trust=, inferenceprovenance=, provenanceaction={classid=, classname=, schemeid=, schemename=}}}"]}},{"relation":{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od______2659::05817f64c43a918a07483340b5726f77","target":"40|corda__h2020::0da81b3ad78047f577dd405e8a2d7f07"},"relatedEntity":{"id":"40|corda__h2020::0da81b3ad78047f577dd405e8a2d7f07","type":"project","projectTitle":"Environmental Research Infrastructures Providing Shared Solutions for Science and Society","code":"654182","acronym":"ENVRI PLUS","contracttype":{},"fundingtree":["{value=ec__________::ECECEuropean CommissionEUec__________::EC::H2020::RIAResearch and Innovation actionRIAec:h2020toasec__________::EC::H2020H2020Horizon 2020 Framework Programmeec:h2020fundings, dataInfo={invisible=false, inferred=false, deletedbyinference=false, trust=, inferenceprovenance=, provenanceaction={classid=, classname=, schemeid=, schemename=}}}"]}},{"relation":{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0,"relType":"resultProject","subRelType":"outcome","relClass":"isProducedBy","source":"50|od______2659::05817f64c43a918a07483340b5726f77","target":"40|corda__h2020::e7f5e7755409fc74eea9d168ab795634"},"relatedEntity":{"id":"40|corda__h2020::e7f5e7755409fc74eea9d168ab795634","type":"project","projectTitle":"SoBigData Research Infrastructure","code":"654024","acronym":"SoBigData","contracttype":{},"fundingtree":["{value=ec__________::ECECEuropean CommissionEUec__________::EC::H2020::RIAResearch and Innovation actionRIAec:h2020toasec__________::EC::H2020H2020Horizon 2020 Framework Programmeec:h2020fundings, dataInfo={invisible=false, inferred=false, deletedbyinference=false, trust=, inferenceprovenance=, provenanceaction={classid=, classname=, schemeid=, schemename=}}}"]}}]} \ No newline at end of file From e5d7cdf4226e552cd7757563083d218316d636ac Mon Sep 17 00:00:00 2001 From: "sandro.labruzzo" Date: Wed, 19 Feb 2020 10:13:36 +0100 Subject: [PATCH 04/11] fixed sql query --- .../dhp/migration/oozie_app/workflow.xml | 2 +- .../dhp/migration/sql/queryProjects.sql | 19 +++++++++---------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/workflow.xml index 309a6d90f0..59111b31d3 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/workflow.xml @@ 
-43,7 +43,7 @@
-
+

diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects.sql
index 6cff188756..685b57ab65 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects.sql
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects.sql
@@ -28,15 +28,17 @@ SELECT
 	p.summary            AS summary,
 	p.currency           AS currency,
 	p.totalcost          AS totalcost,
-	p.fundedamount       AS fundedamount,
+	p.fundedamount       AS fundedamount,
 	dc.id                AS collectedfromid,
 	dc.officialname      AS collectedfromname,
-	ctc.code || '@@@' || ctc.name || '@@@' || cts.code || '@@@' || cts.name AS contracttype,
-	pac.code || '@@@' || pac.name || '@@@' || pas.code || '@@@' || pas.name AS provenanceaction,
-	array_agg(DISTINCT i.pid || '###' || i.issuertype) AS pid,
-	array_agg(DISTINCT s.name || '###' || sc.code || '@@@' || sc.name || '@@@' || ss.code || '@@@' || ss.name) AS subjects,
-	array_agg(DISTINCT fp.path) AS fundingtree
+	p.contracttype || '@@@' || p.contracttypename || '@@@' || p.contracttypescheme || '@@@' || p.contracttypescheme AS contracttype,
+	pac.code || '@@@' || pac.name || '@@@' || pas.code || '@@@' || pas.name AS provenanceaction,
+	array_agg(DISTINCT i.pid || '###' || i.issuertype) AS pid,
+	array_agg(DISTINCT s.name || '###' || sc.code || '@@@' || sc.name || '@@@' || ss.code || '@@@' || ss.name) AS subjects,
+	array_agg(DISTINCT fp.path) AS fundingtree
+
 FROM projects p
+
 	LEFT OUTER JOIN class pac ON (pac.code = p.provenanceactionclass)
 	LEFT OUTER JOIN scheme pas ON (pas.code = p.provenanceactionscheme)
@@ -54,9 +56,6 @@
 	LEFT OUTER JOIN class sc ON (sc.code = s.semanticclass)
 	LEFT OUTER JOIN scheme ss ON (ss.code = s.semanticscheme)
 
-	LEFT OUTER JOIN class ctc ON (ctc.code = p.contracttypeclass)
-	LEFT OUTER JOIN scheme cts ON (cts.code = p.contracttypescheme)
-
 GROUP BY
 	p.id,
 	p.code,
@@ -87,4 +86,4 @@
 	dc.id,
 	dc.officialname,
 	pac.code, pac.name, pas.code, pas.name,
-	ctc.code, ctc.name, cts.code, cts.name;
\ No newline at end of file
+	p.contracttype , p.contracttypename, p.contracttypescheme;
\ No newline at end of file

From 173f1df1e51a451b8af8fb17b5b418b42ee47934 Mon Sep 17 00:00:00 2001
From: Michele Artini
Date: Wed, 19 Feb 2020 10:15:08 +0100
Subject: [PATCH 05/11] saved a query for openaire production database

---
 .../sql/queryProjects_production.sql          | 90 +++++++++++++++++++
 1 file changed, 90 insertions(+)
 create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects_production.sql

diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects_production.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects_production.sql
new file mode 100644
index 0000000000..6cff188756
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects_production.sql
@@ -0,0 +1,90 @@
+SELECT
+	p.id                          AS projectid,
+	p.code                        AS code,
+	p.websiteurl                  AS websiteurl,
+	p.acronym                     AS acronym,
+	p.title                       AS title,
+	p.startdate                   AS startdate,
+	p.enddate                     AS enddate,
+	p.call_identifier             AS callidentifier,
+	p.keywords                    AS keywords,
+	p.duration                    AS duration,
+	p.ec_sc39                     AS ecsc39,
+	p.oa_mandate_for_publications AS oamandatepublications,
+	p.ec_article29_3              AS ecarticle29_3,
+	p.dateofcollection            AS dateofcollection,
+	p.lastupdate                  AS dateoftransformation,
+	p.inferred                    AS inferred,
+	p.deletedbyinference          AS deletedbyinference,
+	p.trust                       AS trust,
+	p.inferenceprovenance         AS inferenceprovenance,
+	p.optional1                   AS optional1,
+	p.optional2                   AS optional2,
+	p.jsonextrainfo               AS jsonextrainfo,
+	p.contactfullname             AS contactfullname,
+	p.contactfax                  AS contactfax,
+	p.contactphone                AS contactphone,
+	p.contactemail                AS contactemail,
+	p.summary                     AS summary,
+	p.currency                    AS currency,
+	p.totalcost                   AS totalcost,
+	p.fundedamount                AS fundedamount,
+	dc.id                         AS collectedfromid,
+	dc.officialname               AS collectedfromname,
+	ctc.code || '@@@' || ctc.name || '@@@' || cts.code || '@@@' || cts.name AS contracttype,
+	pac.code || '@@@' || pac.name || '@@@' || pas.code || '@@@' || pas.name AS provenanceaction,
+	array_agg(DISTINCT i.pid || '###' || i.issuertype) AS pid,
+	array_agg(DISTINCT s.name || '###' || sc.code || '@@@' || sc.name || '@@@' || ss.code || '@@@' || ss.name) AS subjects,
+	array_agg(DISTINCT fp.path) AS fundingtree
+ FROM projects p
+	LEFT OUTER JOIN class pac ON (pac.code = p.provenanceactionclass)
+	LEFT OUTER JOIN scheme pas ON (pas.code = p.provenanceactionscheme)
+
+	LEFT OUTER JOIN projectpids pp ON (pp.project = p.id)
+	LEFT OUTER JOIN dsm_identities i ON (i.pid = pp.pid)
+
+	LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom)
+
+	LEFT OUTER JOIN project_fundingpath pf ON (pf.project = p.id)
+	LEFT OUTER JOIN fundingpaths fp ON (fp.id = pf.funding)
+
+	LEFT OUTER JOIN project_subject ps ON (ps.project = p.id)
+	LEFT OUTER JOIN subjects s ON (s.id = ps.subject)
+
+	LEFT OUTER JOIN class sc ON (sc.code = s.semanticclass)
+	LEFT OUTER JOIN scheme ss ON (ss.code = s.semanticscheme)
+
+	LEFT OUTER JOIN class ctc ON (ctc.code = p.contracttypeclass)
+	LEFT OUTER JOIN scheme cts ON (cts.code = p.contracttypescheme)
+
+ GROUP BY
+	p.id,
+	p.code,
+	p.websiteurl,
+	p.acronym,
+	p.title,
+	p.startdate,
+	p.enddate,
+	p.call_identifier,
+	p.keywords,
+	p.duration,
+	p.ec_sc39,
+	p.oa_mandate_for_publications,
+	p.ec_article29_3,
+	p.dateofcollection,
+	p.inferred,
+	p.deletedbyinference,
+	p.trust,
+	p.inferenceprovenance,
+	p.contactfullname,
+	p.contactfax,
+	p.contactphone,
+	p.contactemail,
+	p.summary,
+	p.currency,
+	p.totalcost,
+	p.fundedamount,
+	dc.id,
+	dc.officialname,
+	pac.code, pac.name, pas.code, pas.name,
+	ctc.code, ctc.name, cts.code, cts.name;
\ No newline at end of file

From 5d3739b5cf0ffd2a967ffc6f8e86082cafd39a3f Mon Sep 17 00:00:00 2001
From: Michele Artini
Date: Wed, 19 Feb 2020 15:11:17 +0100
Subject: [PATCH 06/11] migration of claims

---
 .../migration/AbstractMigrationExecutor.java  |  14 ++
 .../dhp/migration/AbstractMongoExecutor.java  |   2 +
 .../migration/ExtractEntitiesFromHDFSJob.java |  84 ++++---
 .../MigrateDbEntitiesApplication.java         | 113 +++++++--
 ...extract_entities_from_hdfs_parameters.json |  10 +-
 .../migrate_db_entities_parameters.json       |   6 +
 .../dhp/migration/oozie_app/workflow.xml      | 215 ++++++------------
 .../dnetlib/dhp/migration/sql/queryClaims.sql |   1 +
 8 files changed, 238 insertions(+), 207 deletions(-)
 create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryClaims.sql

diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrationExecutor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrationExecutor.java
index e91a530458..b0db3c76fd 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrationExecutor.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrationExecutor.java
@@ -227,7 +227,21 @@
 		final String nsPrefix = StringUtils.substringBefore(originalId, "::");
 		final String rest = StringUtils.substringAfter(originalId, "::");
 		return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest));
+	}
 
+	public static String createOpenaireId(final String type, final String originalId) {
+		switch (type) {
+		case "datasource":
+			return createOpenaireId(10, originalId);
+		case "organization":
+			return createOpenaireId(20, originalId);
+		case "person":
+			return createOpenaireId(30, originalId);
+		case "project":
+			return createOpenaireId(40, originalId);
+		default:
+			return createOpenaireId(50, originalId);
+		}
 	}
 
 	public static String asString(final Object o) {
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java
index 00d1aa60d6..0595726d45 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java
@@ -398,6 +398,8 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {
 	protected DataInfo prepareDataInfo(final Document doc) {
 		final Node n = doc.selectSingleNode("//oaf:datainfo");
 
+		if (n == null) { return null; }
+
 		final String paClassId = n.valueOf("./oaf:provenanceaction/@classid");
 		final String paClassName = n.valueOf("./oaf:provenanceaction/@classname");
 		final String paSchemeId = n.valueOf("./oaf:provenanceaction/@schemeid");
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/ExtractEntitiesFromHDFSJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/ExtractEntitiesFromHDFSJob.java
index f2d9caebf6..22b61798e6 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/ExtractEntitiesFromHDFSJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/ExtractEntitiesFromHDFSJob.java
@@ -1,56 +1,68 @@
 package eu.dnetlib.dhp.migration;
 
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import java.util.Arrays;
+import java.util.List;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.sql.SparkSession;
-import scala.Tuple2;
 
-import java.util.Arrays;
-import java.util.List;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Dataset;
+import eu.dnetlib.dhp.schema.oaf.Datasource;
+import eu.dnetlib.dhp.schema.oaf.Organization;
+import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
+import eu.dnetlib.dhp.schema.oaf.Project;
+import eu.dnetlib.dhp.schema.oaf.Publication;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.oaf.Software;
+import scala.Tuple2;
 
 public class ExtractEntitiesFromHDFSJob {
 
+	public static void main(final String[] args) throws Exception {
+		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+				IOUtils.toString(MigrateMongoMdstoresApplication.class
+						.getResourceAsStream("/eu/dnetlib/dhp/migration/extract_entities_from_hdfs_parameters.json")));
+		parser.parseArgument(args);
 
-	private static List folderNames = Arrays.asList("db_entities", "oaf_entities", "odf_entities");
+		final SparkSession spark = SparkSession
+				.builder()
+				.appName(ExtractEntitiesFromHDFSJob.class.getSimpleName())
+				.master(parser.get("master"))
+				.getOrCreate();
 
-	public static void main(String[] args) throws Exception {
-		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
-				IOUtils.toString(MigrateMongoMdstoresApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/extract_entities_from_hdfs_parameters.json")));
-		parser.parseArgument(args);
+		final List sourcePaths = Arrays.asList(parser.get("sourcePaths").split(","));
+		final String targetPath = parser.get("graphRawPath");
 
-		final SparkSession spark = SparkSession
-				.builder()
-				.appName(ExtractEntitiesFromHDFSJob.class.getSimpleName())
-				.master(parser.get("master"))
-				.getOrCreate();
+		try (final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext())) {
+			processEntity(sc, Publication.class, sourcePaths, targetPath);
+			processEntity(sc, Dataset.class, sourcePaths, targetPath);
+			processEntity(sc, Software.class, sourcePaths, targetPath);
+			processEntity(sc, OtherResearchProduct.class, sourcePaths, targetPath);
+			processEntity(sc, Datasource.class, sourcePaths, targetPath);
+			processEntity(sc, Organization.class, sourcePaths, targetPath);
+			processEntity(sc, Project.class, sourcePaths, targetPath);
+			processEntity(sc, Relation.class, sourcePaths, targetPath);
+		}
+	}
 
-		final String sourcePath = parser.get("sourcePath");
-		final String targetPath = parser.get("graphRawPath");
-		final String entity = parser.get("entity");
+	private static void processEntity(final JavaSparkContext sc, final Class clazz, final List sourcePaths, final String targetPath) {
+		final String type = clazz.getSimpleName().toLowerCase();
+		final JavaRDD inputRdd = sc.emptyRDD();
+		sourcePaths.forEach(sourcePath -> inputRdd.union(sc.sequenceFile(sourcePath, Text.class, Text.class)
+				.map(k -> new Tuple2<>(k._1().toString(), k._2().toString()))
+				.filter(k -> isEntityType(k._1(), type))
+				.map(Tuple2::_2)));
 
-		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+		inputRdd.saveAsTextFile(targetPath + "/" + type);
+	}
 
-
-		JavaRDD inputRdd = sc.emptyRDD();
-
-
-		folderNames.forEach(p -> inputRdd.union(
-				sc.sequenceFile(sourcePath+"/"+p, Text.class, Text.class)
-						.map(k -> new Tuple2<>(k._1().toString(), k._2().toString()))
-						.filter(k -> isEntityType(k._1(), entity))
-						.map(Tuple2::_2))
-		);
-
-		inputRdd.saveAsTextFile(targetPath+"/"+entity);
-	}
-
-
-	private static boolean isEntityType(final String item, final String entity) {
-		return StringUtils.substringAfter(item, ":").equalsIgnoreCase(entity);
-	}
+	private static boolean isEntityType(final String item, final String entity) {
+		return StringUtils.substringAfter(item, ":").equalsIgnoreCase(entity);
+	}
 }
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java
index d22e8e5b36..1ccfd09ef0 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java
@@ -17,15 +17,21 @@ import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Context;
 import eu.dnetlib.dhp.schema.oaf.DataInfo;
+import eu.dnetlib.dhp.schema.oaf.Dataset;
 import eu.dnetlib.dhp.schema.oaf.Datasource;
 import eu.dnetlib.dhp.schema.oaf.Field;
 import eu.dnetlib.dhp.schema.oaf.Journal;
 import eu.dnetlib.dhp.schema.oaf.KeyValue;
 import eu.dnetlib.dhp.schema.oaf.Organization;
+import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
 import eu.dnetlib.dhp.schema.oaf.Project;
+import eu.dnetlib.dhp.schema.oaf.Publication;
 import eu.dnetlib.dhp.schema.oaf.Qualifier;
 import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.oaf.Result;
+import eu.dnetlib.dhp.schema.oaf.Software;
 import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
 
 public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor implements Closeable {
@@ -53,22 +59,28 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl
 		final String hdfsNameNode = parser.get("namenode");
 		final String hdfsUser = parser.get("hdfsUser");
 
+		final boolean processClaims = parser.get("action") != null && parser.get("action").equalsIgnoreCase("claims");
+
 		try (final MigrateDbEntitiesApplication smdbe = new MigrateDbEntitiesApplication(hdfsPath, hdfsNameNode, hdfsUser, dbUrl, dbUser, dbPassword)) {
-			log.info("Processing datasources...");
-			smdbe.execute("queryDatasources.sql", smdbe::processDatasource);
+			if (processClaims) {
+				log.info("Processing claims...");
+				smdbe.execute("queryClaims.sql", smdbe::processClaims);
+			} else {
+				log.info("Processing datasources...");
+				smdbe.execute("queryDatasources.sql", smdbe::processDatasource);
 
-			log.info("Processing projects...");
-			smdbe.execute("queryProjects.sql", smdbe::processProject);
+				log.info("Processing projects...");
+				smdbe.execute("queryProjects.sql", smdbe::processProject);
 
-			log.info("Processing orgs...");
-			smdbe.execute("queryOrganizations.sql", smdbe::processOrganization);
+				log.info("Processing orgs...");
+				smdbe.execute("queryOrganizations.sql", smdbe::processOrganization);
 
-			log.info("Processing relations ds <-> orgs ...");
-			smdbe.execute("queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization);
-
-			log.info("Processing projects <-> orgs ...");
-			smdbe.execute("queryProjectOrganization.sql", smdbe::processProjectOrganization);
+				log.info("Processing relations ds <-> orgs ...");
+				smdbe.execute("queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization);
 
+				log.info("Processing projects <-> orgs ...");
+				smdbe.execute("queryProjectOrganization.sql", smdbe::processProjectOrganization);
+			}
 			log.info("All done.");
 		}
 	}
@@ -377,7 +389,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl
 			r2.setTarget(dsId);
 			r2.setCollectedFrom(collectedFrom);
 			r2.setDataInfo(info);
-			r1.setLastupdatetimestamp(lastUpdateTimestamp);
+			r2.setLastupdatetimestamp(lastUpdateTimestamp);
 			emitOaf(r2);
 
 			// rs.getString("datasource");
@@ -426,7 +438,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl
 			r2.setTarget(projectId);
 			r2.setCollectedFrom(collectedFrom);
 			r2.setDataInfo(info);
-			r1.setLastupdatetimestamp(lastUpdateTimestamp);
+			r2.setLastupdatetimestamp(lastUpdateTimestamp);
 			emitOaf(r2);
 
 			// rs.getString("project");
@@ -450,6 +462,81 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl
 		}
 	}
 
+	public void processClaims(final ResultSet rs) {
+
+		final DataInfo info =
+				dataInfo(false, null, false, false, qualifier("user:claim", "user:claim", "dnet:provenanceActions", "dnet:provenanceActions"), "0.9");
+
+		try {
+
+			if (rs.getString("source_type").equals("context")) {
+				final Result r;
+
+				if (rs.getString("target_type").equals("dataset")) {
+					r = new Dataset();
+				} else if (rs.getString("target_type").equals("software")) {
+					r = new Software();
+				} else if (rs.getString("target_type").equals("other")) {
+					r = new OtherResearchProduct();
+				} else {
+					r = new Publication();
+				}
+				r.setId(createOpenaireId(50, rs.getString("target_id")));
+				r.setLastupdatetimestamp(lastUpdateTimestamp);
+				r.setContext(prepareContext(rs.getString("source_id"), info));
+				r.setDataInfo(info);
+				emitOaf(r);
+			} else {
+				final String sourceId = createOpenaireId(rs.getString("source_type"), rs.getString("source_id"));
+				final String targetId = createOpenaireId(rs.getString("target_type"), rs.getString("target_id"));
+
+				final Relation r1 = new Relation();
+				final Relation r2 = new Relation();
+
+				if (rs.getString("source_type").equals("project")) {
+					r1.setRelType("resultProject");
+					r1.setSubRelType("outcome");
+					r1.setRelClass("produces");
+
+					r2.setRelType("resultProject");
+					r2.setSubRelType("outcome");
+					r2.setRelClass("isProducedBy");
+				} else {
+					r1.setRelType("resultResult");
+					r1.setSubRelType("relationship");
+					r1.setRelClass("isRelatedTo");
+
+					r2.setRelType("resultResult");
+					r2.setSubRelType("relationship");
+					r2.setRelClass("isRelatedTo");
+				}
+
+				r1.setSource(sourceId);
+				r1.setTarget(targetId);
+				r1.setDataInfo(info);
+				r1.setLastupdatetimestamp(lastUpdateTimestamp);
+				emitOaf(r1);
+
+				r2.setSource(targetId);
+				r2.setTarget(sourceId);
+				r2.setDataInfo(info);
+				r2.setLastupdatetimestamp(lastUpdateTimestamp);
+				emitOaf(r2);
+
+			}
+
+		} catch (final Exception e) {
+			throw new RuntimeException(e);
+		}
+	}
+
+	private List prepareContext(final String id, final DataInfo dataInfo) {
+		final Context context = new Context();
+		context.setId(id);
+		context.setDataInfo(Arrays.asList(dataInfo));
+		return Arrays.asList(context);
+	}
+
 	private DataInfo prepareDataInfo(final ResultSet rs) throws SQLException {
 		final Boolean deletedbyinference = rs.getBoolean("deletedbyinference");
 		final String inferenceprovenance = rs.getString("inferenceprovenance");
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/extract_entities_from_hdfs_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/extract_entities_from_hdfs_parameters.json
index f179ee0f80..0039493e7b 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/extract_entities_from_hdfs_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/extract_entities_from_hdfs_parameters.json
@@ -1,8 +1,8 @@
 [
   {
     "paramName": "s",
-    "paramLongName": "sourcePath",
-    "paramDescription": "the HDFS source path which contains the sequential file",
+    "paramLongName": "sourcePaths",
+    "paramDescription": "the HDFS source paths which contains the sequential file (comma separated)",
     "paramRequired": true
   },
   {
@@ -16,11 +16,5 @@
     "paramLongName": "graphRawPath",
     "paramDescription": "the path of the graph Raw in hdfs",
     "paramRequired": true
-  },
-  {
-    "paramName": "e",
-    "paramLongName": "entity",
-    "paramDescription": "The entity to extract",
-    "paramRequired": true
   }
 ]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json
b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json index 5e9f378f55..4506e2ae1a 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json @@ -34,5 +34,11 @@ "paramLongName": "postgresPassword", "paramDescription": "postgres password", "paramRequired": false + }, + { + "paramName": "a", + "paramLongName": "action", + "paramDescription": "process claims", + "paramRequired": false } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/workflow.xml index ff23fff4a3..b11cddfcf1 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/workflow.xml @@ -43,8 +43,7 @@ - - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] @@ -71,6 +70,23 @@ -dbuser${postgresUser} -dbpasswd${postgresPassword} + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.dhp.migration.MigrateDbEntitiesApplication + -p${workingPath}/db_claims + -n${nameNode} + -u${hdfsUser} + -dburl${postgresURL} + -dbuser${postgresUser} + -dbpasswd${postgresPassword} + -aclaims + @@ -113,170 +129,69 @@ -pguser${postgresUser} -pgpasswd${postgresPassword} - + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.dhp.migration.MigrateMongoMdstoresApplication + -p${workingPath}/odf_claims + -n${nameNode} + -u${hdfsUser} + -mongourl${mongourl} + -db${mongoDb} + -fODF + -lstore + -iclaim + -pgurl${postgresURL} + -pguser${postgresUser} + -pgpasswd${postgresPassword} + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.dhp.migration.MigrateMongoMdstoresApplication + -p${workingPath}/oaf_claims + -n${nameNode} + -u${hdfsUser} + -mongourl${mongourl} + -db${mongoDb} + -fOAF + -lstore + -iclaim + -pgurl${postgresURL} + -pguser${postgresUser} + -pgpasswd${postgresPassword} + + + + - + ${jobTracker} ${nameNode} yarn-cluster cluster - ExtractEntities: publication + ExtractEntities eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob dhp-aggregation-${projectVersion}.jar --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" -mt yarn-cluster - --sourcePath${workingPath} - -g${graphRawPath}/publication - -epublication - - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - ExtractEntities: dataset - eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob - dhp-aggregation-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" - -mt yarn-cluster - --sourcePath${workingPath} - -g${graphRawPath}/dataset - -edataset - - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - 
ExtractEntities: software - eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob - dhp-aggregation-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" - -mt yarn-cluster - --sourcePath${workingPath} - -g${graphRawPath}/software - -esoftware - - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - ExtractEntities: otherresearchproduct - eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob - dhp-aggregation-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" - -mt yarn-cluster - --sourcePath${workingPath} - -g${graphRawPath}/otherresearchproduct - -eotherresearchproduct - - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - ExtractEntities: datasource - eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob - dhp-aggregation-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" - -mt yarn-cluster - --sourcePath${workingPath} - -g${graphRawPath}/datasource - -edatasource - - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - ExtractEntities: organization - eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob - dhp-aggregation-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" - -mt yarn-cluster - --sourcePath${workingPath} - -g${graphRawPath}/organization - -eorganization - - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - ExtractEntities: project - eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob - dhp-aggregation-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" - -mt yarn-cluster - --sourcePath${workingPath} - -g${graphRawPath}/project - -eproject - - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - ExtractEntities: relation - eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob - dhp-aggregation-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf 
spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" - -mt yarn-cluster - --sourcePath${workingPath} - -g${graphRawPath}/relation - -erelation + -s${workingPath}/db_entities,${workingPath}/oaf_entities,${workingPath}/odf_entities + -g${graphRawPath} - - + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryClaims.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryClaims.sql new file mode 100644 index 0000000000..0390c11aa3 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryClaims.sql @@ -0,0 +1 @@ +SELECT source_type, source_id, target_type, target_id, semantics FROM claim WHERE approved=TRUE; \ No newline at end of file From 33185fd0b7e923c36e0f0a451dd1412910f3dcbb Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 19 Feb 2020 16:56:38 +0100 Subject: [PATCH 07/11] ISLookupClientFactory moved in dhp-common --- dhp-common/pom.xml | 12 ++++++++++++ .../eu/dnetlib/dhp}/utils/ISLookupClientFactory.java | 2 +- dhp-workflows/dhp-graph-provision/pom.xml | 8 -------- .../eu/dnetlib/dhp/graph/SparkXmlIndexingJob.java | 2 +- .../eu/dnetlib/dhp/graph/utils/ContextMapper.java | 1 + pom.xml | 7 +++++++ 6 files changed, 22 insertions(+), 10 deletions(-) rename {dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph => dhp-common/src/main/java/eu/dnetlib/dhp}/utils/ISLookupClientFactory.java (96%) diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index 4416bd4fff..a9fb39ea08 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -46,6 +46,18 @@ net.sf.saxon Saxon-HE + + org.slf4j + jcl-over-slf4j + + + org.apache.cxf + cxf-rt-transports-http + + + eu.dnetlib + cnr-rmi-api + diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/ISLookupClientFactory.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/ISLookupClientFactory.java similarity index 96% rename from dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/ISLookupClientFactory.java rename to dhp-common/src/main/java/eu/dnetlib/dhp/utils/ISLookupClientFactory.java index d87f29452d..c74cf3c111 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/ISLookupClientFactory.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/ISLookupClientFactory.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dhp.graph.utils; +package eu.dnetlib.dhp.utils; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import org.apache.commons.logging.Log; diff --git a/dhp-workflows/dhp-graph-provision/pom.xml b/dhp-workflows/dhp-graph-provision/pom.xml index f74c9b6668..ac4e01d218 100644 --- a/dhp-workflows/dhp-graph-provision/pom.xml +++ b/dhp-workflows/dhp-graph-provision/pom.xml @@ -66,14 +66,6 @@ zookeeper - - org.apache.cxf - cxf-rt-transports-http - - - eu.dnetlib - cnr-rmi-api - eu.dnetlib.dhp diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlIndexingJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlIndexingJob.java index 2775d93b43..63ff8fb312 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlIndexingJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlIndexingJob.java @@ -2,8 +2,8 @@ package eu.dnetlib.dhp.graph; import 
com.lucidworks.spark.util.SolrSupport;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.graph.utils.ISLookupClientFactory;
 import eu.dnetlib.dhp.graph.utils.StreamingInputDocumentFactory;
+import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpDocumentNotFoundException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/ContextMapper.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/ContextMapper.java
index 0c3a481d02..ad9e7dfadd 100644
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/ContextMapper.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/ContextMapper.java
@@ -1,6 +1,7 @@
 package eu.dnetlib.dhp.graph.utils;
 
 import com.google.common.base.Joiner;
+import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
 import org.dom4j.Document;
diff --git a/pom.xml b/pom.xml
index 7f5e1e3dcd..faed1db356 100644
--- a/pom.xml
+++ b/pom.xml
@@ -129,6 +129,13 @@
             provided
 
+
+            org.slf4j
+            jcl-over-slf4j
+            1.7.25
+            provided
+
+
             org.apache.commons
             commons-lang3
From d42dde52baff933c5bac8141fdf21668c846a247 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Wed, 19 Feb 2020 17:29:05 +0100
Subject: [PATCH 08/11] implemented method to merge relations

---
 .../eu/dnetlib/dhp/schema/oaf/Relation.java |  22 ++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java
index 5cf0883beb..d404981f41 100644
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java
@@ -1,6 +1,11 @@
 package eu.dnetlib.dhp.schema.oaf;
 
+import org.junit.Assert;
+
+import java.util.ArrayList;
 import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
 
 public class Relation extends Oaf {
 
@@ -14,7 +19,7 @@ public class Relation extends Oaf {
 
     private String target;
 
-    private List<KeyValue> collectedFrom;
+    private List<KeyValue> collectedFrom = new ArrayList<>();
 
     public String getRelType() {
         return relType;
@@ -63,4 +68,19 @@ public class Relation extends Oaf {
     public void setCollectedFrom(List<KeyValue> collectedFrom) {
         this.collectedFrom = collectedFrom;
     }
+
+    public void mergeFrom(final Relation r) {
+        Assert.assertEquals("source ids must be equal", getSource(), r.getSource());
+        Assert.assertEquals("target ids must be equal", getTarget(), r.getTarget());
+        Assert.assertEquals("relType(s) must be equal", getRelType(), r.getRelType());
+        Assert.assertEquals("subRelType(s) must be equal", getSubRelType(), r.getSubRelType());
+        Assert.assertEquals("relClass(es) must be equal", getRelClass(), r.getRelClass());
+        setCollectedFrom(
+                Stream.concat(
+                        getCollectedFrom().stream(),
+                        r.getCollectedFrom().stream())
+                .distinct() // relies on KeyValue.equals
+                .collect(Collectors.toList()));
+    }
+
 }
From 4c94e74a8475f5d1aeea1f475bf4b0ab4722e7f3 Mon Sep 17 00:00:00 2001
From: "michele.artini"
Date: Thu, 20 Feb 2020 11:43:32 +0100
Subject: [PATCH 09/11] Added a missing dependency

---
 dhp-schemas/pom.xml                           |   6 +
 .../eu/dnetlib/dhp/schema/oaf/Relation.java   | 111 +++++++++---------
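A usage sketch for the mergeFrom method introduced in PATCH 08 above: it collapses two relations that agree on source, target, relType, subRelType and relClass, keeping the union of their collectedFrom provenance entries. The grouping key and the input RDD below are hypothetical illustrations, not part of these patches:

    import org.apache.spark.api.java.JavaRDD;
    import scala.Tuple2;

    // rels: an existing JavaRDD<Relation>; deduplicate relations that differ
    // only in provenance by grouping on the five identity fields and merging.
    JavaRDD<Relation> merged = rels
            .mapToPair(r -> new Tuple2<>(
                    r.getSource() + "|" + r.getTarget() + "|" + r.getRelType()
                            + "|" + r.getSubRelType() + "|" + r.getRelClass(), r))
            .reduceByKey((a, b) -> { a.mergeFrom(b); return a; })
            .values();

Note that mergeFrom enforces the key equality with org.junit.Assert, which appears to be why PATCH 09 adds junit as a regular (non-test) dependency of dhp-schemas: the assertion class is now referenced from main code.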
 pom.xml                                       |   3 +-
 3 files changed, 62 insertions(+), 58 deletions(-)

diff --git a/dhp-schemas/pom.xml b/dhp-schemas/pom.xml
index d8873d33d0..89e52858be 100644
--- a/dhp-schemas/pom.xml
+++ b/dhp-schemas/pom.xml
@@ -30,6 +30,12 @@
             com.fasterxml.jackson.core
             jackson-databind
 
+
+            junit
+            junit
+            ${junit.version}
+
+
             eu.dnetlib.dhp
diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java
index d404981f41..24a363bec2 100644
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java
@@ -1,86 +1,83 @@
 package eu.dnetlib.dhp.schema.oaf;
 
-import org.junit.Assert;
-
 import java.util.ArrayList;
 import java.util.List;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 
+import org.junit.Assert;
+
 public class Relation extends Oaf {
 
-    private String relType;
+    private String relType;
 
-    private String subRelType;
+    private String subRelType;
 
-    private String relClass;
+    private String relClass;
 
-    private String source;
+    private String source;
 
-    private String target;
+    private String target;
 
-    private List<KeyValue> collectedFrom = new ArrayList<>();
+    private List<KeyValue> collectedFrom = new ArrayList<>();
 
-    public String getRelType() {
-        return relType;
-    }
+    public String getRelType() {
+        return relType;
+    }
 
-    public void setRelType(String relType) {
-        this.relType = relType;
-    }
+    public void setRelType(final String relType) {
+        this.relType = relType;
+    }
 
-    public String getSubRelType() {
-        return subRelType;
-    }
+    public String getSubRelType() {
+        return subRelType;
+    }
 
-    public void setSubRelType(String subRelType) {
-        this.subRelType = subRelType;
-    }
+    public void setSubRelType(final String subRelType) {
+        this.subRelType = subRelType;
+    }
 
-    public String getRelClass() {
-        return relClass;
-    }
+    public String getRelClass() {
+        return relClass;
+    }
 
-    public void setRelClass(String relClass) {
-        this.relClass = relClass;
-    }
+    public void setRelClass(final String relClass) {
+        this.relClass = relClass;
+    }
 
-    public String getSource() {
-        return source;
-    }
+    public String getSource() {
+        return source;
+    }
 
-    public void setSource(String source) {
-        this.source = source;
-    }
+    public void setSource(final String source) {
+        this.source = source;
+    }
 
-    public String getTarget() {
-        return target;
-    }
+    public String getTarget() {
+        return target;
+    }
 
-    public void setTarget(String target) {
-        this.target = target;
-    }
+    public void setTarget(final String target) {
+        this.target = target;
+    }
 
-    public List<KeyValue> getCollectedFrom() {
-        return collectedFrom;
-    }
+    public List<KeyValue> getCollectedFrom() {
+        return collectedFrom;
+    }
 
-    public void setCollectedFrom(List<KeyValue> collectedFrom) {
-        this.collectedFrom = collectedFrom;
-    }
+    public void setCollectedFrom(final List<KeyValue> collectedFrom) {
+        this.collectedFrom = collectedFrom;
+    }
 
-    public void mergeFrom(final Relation r) {
-        Assert.assertEquals("source ids must be equal", getSource(), r.getSource());
-        Assert.assertEquals("target ids must be equal", getTarget(), r.getTarget());
-        Assert.assertEquals("relType(s) must be equal", getRelType(), r.getRelType());
-        Assert.assertEquals("subRelType(s) must be equal", getSubRelType(), r.getSubRelType());
-        Assert.assertEquals("relClass(es) must be equal", getRelClass(), r.getRelClass());
-        setCollectedFrom(
-                Stream.concat(
-                        getCollectedFrom().stream(),
-                        r.getCollectedFrom().stream())
-                .distinct() // relies on KeyValue.equals
-                .collect(Collectors.toList()));
-    }
+    public void mergeFrom(final Relation r) {
+        Assert.assertEquals("source ids must be equal", getSource(), r.getSource());
+        Assert.assertEquals("target ids must be equal", getTarget(), r.getTarget());
+        Assert.assertEquals("relType(s) must be equal", getRelType(), r.getRelType());
+        Assert.assertEquals("subRelType(s) must be equal", getSubRelType(), r.getSubRelType());
+        Assert.assertEquals("relClass(es) must be equal", getRelClass(), r.getRelClass());
+        setCollectedFrom(Stream.concat(getCollectedFrom().stream(), r.getCollectedFrom().stream())
+                .distinct() // relies on KeyValue.equals
+                .collect(Collectors.toList()));
+    }
 }
diff --git a/pom.xml b/pom.xml
index faed1db356..74003a4078 100644
--- a/pom.xml
+++ b/pom.xml
@@ -76,7 +76,7 @@
             junit
             junit
-            4.12
+            ${junit.version}
             test
 
@@ -481,6 +481,7 @@
         2.9.6
         3.5
         2.11.12
+        4.12
         3.4.2
From 6a73fd5da561c1249dcc8ff607f0722685ee54e6 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Fri, 21 Feb 2020 09:17:19 +0100
Subject: [PATCH 10/11] in order to reuse the same XmlRecordFactory across
 different tasks, the state of contexts must be one per record built

---
 .../dhp/graph/utils/XmlRecordFactory.java     | 53 ++++++++++---------
 1 file changed, 28 insertions(+), 25 deletions(-)

diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/XmlRecordFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/XmlRecordFactory.java
index abcf2a7ecb..df34b08d35 100644
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/XmlRecordFactory.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/XmlRecordFactory.java
@@ -43,8 +43,6 @@ public class XmlRecordFactory implements Serializable {
 
     private String schemaLocation;
 
-    private Set<String> contextes = Sets.newHashSet();
-
     private boolean indent = false;
 
     public XmlRecordFactory(
@@ -59,15 +57,18 @@ public class XmlRecordFactory implements Serializable {
     }
 
     public String build(final JoinedEntity je) {
+
+        final Set<String> contexts = Sets.newHashSet();
+
         final OafEntity entity = je.getEntity();
         TemplateFactory templateFactory = new TemplateFactory();
         try {
-            final List metadata = metadata(je.getType(), entity);
+            final List metadata = metadata(je.getType(), entity, contexts);
 
             // rels has to be processed before the contexts because they enrich the contextMap with the funding info.
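The motivation behind PATCH 10, condensed: an XmlRecordFactory instance is Serializable and may be reused across Spark tasks, so a field-level Set accumulates the context ids of every record ever built, and later records inherit contexts from earlier ones. A minimal, self-contained illustration of the two designs (the class and values here are hypothetical, only the pattern matches the patch):

    import java.util.HashSet;
    import java.util.Set;

    class ContextStateDemo {
        private final Set<String> contextes = new HashSet<>(); // before: shared field

        Set<String> buildWithField(String recordContextId) {
            contextes.add(recordContextId);
            return contextes;   // a later record also reports earlier records' ids
        }

        Set<String> buildWithLocal(String recordContextId) {
            final Set<String> contexts = new HashSet<>();      // after: one Set per build()
            contexts.add(recordContextId);
            return contexts;    // only this record's ids
        }
    }

This is why the diff below threads contexts through metadata(...), listRelations(...), buildContexts(...) and fillContextMap(...) as an explicit parameter instead of touching the old contextes field.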
-            final List relations = listRelations(je, templateFactory);
+            final List relations = listRelations(je, templateFactory, contexts);
 
-            metadata.addAll(buildContexts(getMainType(je.getType())));
+            metadata.addAll(buildContexts(getMainType(je.getType()), contexts));
             metadata.add(parseDataInfo(entity.getDataInfo()));
 
             final String body = templateFactory.buildBody(
@@ -97,10 +98,11 @@ public class XmlRecordFactory implements Serializable {
         }
     }
 
-    private List metadata(final String type, final OafEntity entity) {
+    private List metadata(final String type, final OafEntity entity, final Set<String> contexts) {
 
         final List metadata = Lists.newArrayList();
 
+
         if (entity.getCollectedfrom() != null) {
             metadata.addAll(entity.getCollectedfrom()
                     .stream()
@@ -123,6 +125,17 @@ public class XmlRecordFactory implements Serializable {
         if (GraphMappingUtils.isResult(type)) {
             final Result r = (Result) entity;
 
+            if (r.getContext() != null) {
+                contexts.addAll(r.getContext()
+                        .stream()
+                        .map(c -> c.getId())
+                        .collect(Collectors.toList()));
+                /* FIXME: Workaround for CLARIN mining issue: #3670#note-29 */
+                if (contexts.contains("dh-ch::subcommunity::2")) {
+                    contexts.add("clarin");
+                }
+            }
+
             if (r.getTitle() != null) {
                 metadata.addAll(r.getTitle()
                         .stream()
@@ -235,16 +248,6 @@ public class XmlRecordFactory implements Serializable {
             }
 
             metadata.add(mapQualifier("bestaccessright", getBestAccessright(r)));
-
-            if (r.getContext() != null) {
-                contextes.addAll(r.getContext()
-                        .stream()
-                        .map(c -> c.getId())
-                        .collect(Collectors.toList()));
-                if (contextes.contains("dh-ch::subcommunity::2")) {
-                    contextes.add("clarin");
-                }
-            }
         }
 
         switch (EntityType.valueOf(type)) {
@@ -618,7 +621,7 @@ public class XmlRecordFactory implements Serializable {
         return bestAccessRight;
     }
 
-    private List listRelations(final JoinedEntity je, TemplateFactory templateFactory) {
+    private List listRelations(final JoinedEntity je, TemplateFactory templateFactory, final Set<String> contexts) {
         final List rels = Lists.newArrayList();
 
         for (final Tuple2 link : je.getLinks()) {
@@ -699,7 +702,7 @@ public class XmlRecordFactory implements Serializable {
                 if (re.getFundingtree() != null) {
                     metadata.addAll(re.getFundingtree()
                             .stream()
-                            .peek(ft -> fillContextMap(ft))
+                            .peek(ft -> fillContextMap(ft, contexts))
                             .map(ft -> getRelFundingTree(ft))
                             .collect(Collectors.toList()));
                 }
@@ -807,14 +810,14 @@ public class XmlRecordFactory implements Serializable {
                 .collect(Collectors.toList()) : Lists.newArrayList();
     }
 
-    private List buildContexts(final String type) {
+    private List buildContexts(final String type, final Set<String> contexts) {
         final List res = Lists.newArrayList();
 
         if ((contextMapper != null) && !contextMapper.isEmpty() && MainEntityType.result.toString().equals(type)) {
 
             XMLTag document = XMLDoc.newDocument(true).addRoot("contextRoot");
 
-            for (final String context : contextes) {
+            for (final String context : contexts) {
 
                 String id = "";
                 for (final String token : Splitter.on("::").split(context)) {
@@ -882,7 +885,7 @@ public class XmlRecordFactory implements Serializable {
         return buffer.toString();
     }
 
-    private void fillContextMap(final String xmlTree) {
+    private void fillContextMap(final String xmlTree, final Set<String> contexts) {
 
         Document fundingPath;
         try {
@@ -896,7 +899,7 @@ public class XmlRecordFactory implements Serializable {
         if (funder != null) {
             final String funderShortName = funder.valueOf("./shortname");
-            contextes.add(funderShortName);
+            contexts.add(funderShortName);
 
             contextMapper.put(funderShortName, new ContextDef(funderShortName, funder.valueOf("./name"), "context", "funding"));
             final Node level0 = fundingPath.selectSingleNode("//funding_level_0");
@@ -905,17 +908,17 @@ public class XmlRecordFactory implements Serializable {
                 contextMapper.put(level0Id, new ContextDef(level0Id, level0.valueOf("./description"), "category", ""));
                 final Node level1 = fundingPath.selectSingleNode("//funding_level_1");
                 if (level1 == null) {
-                    contextes.add(level0Id);
+                    contexts.add(level0Id);
                 } else {
                     final String level1Id = Joiner.on("::").join(level0Id, level1.valueOf("./name"));
                     contextMapper.put(level1Id, new ContextDef(level1Id, level1.valueOf("./description"), "concept", ""));
                     final Node level2 = fundingPath.selectSingleNode("//funding_level_2");
                     if (level2 == null) {
-                        contextes.add(level1Id);
+                        contexts.add(level1Id);
                     } else {
                         final String level2Id = Joiner.on("::").join(level1Id, level2.valueOf("./name"));
                         contextMapper.put(level2Id, new ContextDef(level2Id, level2.valueOf("./description"), "concept", ""));
-                        contextes.add(level2Id);
+                        contexts.add(level2Id);
                     }
                 }
             }
From 93665773eacc4d11b3d1631f405cfbd61349b998 Mon Sep 17 00:00:00 2001
From: "michele.artini"
Date: Tue, 25 Feb 2020 15:59:21 +0100
Subject: [PATCH 11/11] Fixed a problem with JavaRDD Union

---
 .../migration/ExtractEntitiesFromHDFSJob.java | 46 +++++++++++++++----
 .../dhp/migration/oozie_app/workflow.xml      | 17 +++++--
 2 files changed, 49 insertions(+), 14 deletions(-)

diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/ExtractEntitiesFromHDFSJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/ExtractEntitiesFromHDFSJob.java
index 22b61798e6..3b6fc9b5de 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/ExtractEntitiesFromHDFSJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/ExtractEntitiesFromHDFSJob.java
@@ -1,10 +1,16 @@
 package eu.dnetlib.dhp.migration;
 
+import java.io.IOException;
 import java.util.Arrays;
 import java.util.List;
+import java.util.stream.Collectors;
 
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.Text;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
@@ -23,6 +29,8 @@ import scala.Tuple2;
 
 public class ExtractEntitiesFromHDFSJob {
 
+    private static final Log log = LogFactory.getLog(ExtractEntitiesFromHDFSJob.class);
+
     public static void main(final String[] args) throws Exception {
         final ArgumentApplicationParser parser = new ArgumentApplicationParser(
                 IOUtils.toString(MigrateMongoMdstoresApplication.class
                         .getResourceAsStream("/eu/dnetlib/dhp/migration/extract_entities_from_hdfs_parameters.json")));
@@ -35,10 +43,11 @@ public class ExtractEntitiesFromHDFSJob {
                 .master(parser.get("master"))
                 .getOrCreate();
 
-        final List<String> sourcePaths = Arrays.asList(parser.get("sourcePaths").split(","));
-        final String targetPath = parser.get("graphRawPath");
-
         try (final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext())) {
+
+            final List<String> sourcePaths = Arrays.stream(parser.get("sourcePaths").split(",")).filter(p -> exists(sc, p)).collect(Collectors.toList());
+            final String targetPath = parser.get("graphRawPath");
+
             processEntity(sc, Publication.class, sourcePaths, targetPath);
             processEntity(sc, Dataset.class, sourcePaths, targetPath);
             processEntity(sc, Software.class, sourcePaths, targetPath);
@@ -53,16 +62,33 @@ public class ExtractEntitiesFromHDFSJob {
 
     private static void processEntity(final JavaSparkContext sc, final Class clazz, final List<String> sourcePaths, final String targetPath) {
         final String type = clazz.getSimpleName().toLowerCase();
 
-        final JavaRDD<String> inputRdd = sc.emptyRDD();
-        sourcePaths.forEach(sourcePath -> inputRdd.union(sc.sequenceFile(sourcePath, Text.class, Text.class)
-                .map(k -> new Tuple2<>(k._1().toString(), k._2().toString()))
-                .filter(k -> isEntityType(k._1(), type))
-                .map(Tuple2::_2)));
+        log.info(String.format("Processing entities (%s) in files:", type));
+        sourcePaths.forEach(log::info);
+
+        JavaRDD<String> inputRdd = sc.emptyRDD();
+
+        for (final String sp : sourcePaths) {
+            inputRdd = inputRdd.union(sc.sequenceFile(sp, Text.class, Text.class)
+                    .map(k -> new Tuple2<>(k._1().toString(), k._2().toString()))
+                    .filter(k -> isEntityType(k._1(), type))
+                    .map(Tuple2::_2));
+        }
 
         inputRdd.saveAsTextFile(targetPath + "/" + type);
+
     }
 
-    private static boolean isEntityType(final String item, final String entity) {
-        return StringUtils.substringAfter(item, ":").equalsIgnoreCase(entity);
+    private static boolean isEntityType(final String item, final String type) {
+        return StringUtils.substringAfter(item, ":").equalsIgnoreCase(type);
+    }
+
+    private static boolean exists(final JavaSparkContext context, final String pathToFile) {
+        try {
+            final FileSystem hdfs = org.apache.hadoop.fs.FileSystem.get(context.hadoopConfiguration());
+            final Path path = new Path(pathToFile);
+            return hdfs.exists(path);
+        } catch (final IOException e) {
+            throw new RuntimeException(e);
+        }
     }
 }
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/workflow.xml
index b11cddfcf1..6589633217 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/oozie_app/workflow.xml
@@ -43,7 +43,7 @@
 
-
+
 
         Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
@@ -87,7 +87,7 @@
             -dbpasswd${postgresPassword}
             -aclaims
-
+
@@ -171,11 +171,20 @@
             -pguser${postgresUser}
             -pgpasswd${postgresPassword}
-
+
-
+
+
+
+
+
+
+
+
+
+
         ${jobTracker}
         ${nameNode}
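The root cause behind PATCH 11's subject line: JavaRDD.union is a transformation that returns a new RDD and never mutates its receiver, so the result built inside the old forEach was discarded and every entity type was written out from the initial empty RDD. A minimal sketch of the broken and corrected accumulation patterns (the paths are hypothetical):

    import java.util.Arrays;
    import java.util.List;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;

    // inside a method body, given an existing JavaSparkContext sc:
    List<String> paths = Arrays.asList("/tmp/part-a", "/tmp/part-b");
    JavaRDD<String> acc = sc.emptyRDD();

    // Broken (pre-patch): the RDD returned by union() was thrown away:
    //   paths.forEach(p -> acc.union(sc.textFile(p)));   // acc stays empty
    // Fixed (the patch): reassign at every step. A lambda could not perform the
    // reassignment anyway, since captured locals must be effectively final,
    // which is presumably why the patch switches to a plain for loop.
    for (final String p : paths) {
        acc = acc.union(sc.textFile(p));
    }

The added exists() check serves a related robustness goal: union over a missing sequence-file path would otherwise fail when the job materializes the RDD, so non-existent source paths are now filtered out up front.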