From 0b5bf53b45316792a5e0fba058ebc0979d6b54ce Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Thu, 23 Feb 2023 12:42:42 +0200 Subject: [PATCH 01/16] Remove unnecessary indexed fields from Solr --- .../dhp/oa/provision/XmlIndexingJob.java | 2 +- .../utils/StreamingInputDocumentFactory.java | 25 +- .../dhp/oa/provision/EOSCFuture_Test.java | 3 +- .../provision/IndexRecordTransformerTest.java | 6 +- .../dhp/oa/provision/SolrConfigTest.java | 11 +- .../eu/dnetlib/dhp/oa/provision/fields.xml | 273 +++++++----------- 6 files changed, 120 insertions(+), 200 deletions(-)
diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java index e7dbdbd2b..1560fcbd9 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java @@ -151,7 +151,7 @@ public class XmlIndexingJob { .sequenceFile(inputPath, Text.class, Text.class) .map(t -> t._2().toString()) .map(s -> toIndexRecord(SaxonTransformerFactory.newInstance(indexRecordXslt), s)) - .map(s -> new StreamingInputDocumentFactory(version, dsId).parseDocument(s)); + .map(s -> new StreamingInputDocumentFactory().parseDocument(s)); switch (outputFormat) { case SOLR:
diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/StreamingInputDocumentFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/StreamingInputDocumentFactory.java index 36028be9e..b42f9ee83 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/StreamingInputDocumentFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/StreamingInputDocumentFactory.java @@ -36,10 +36,6 @@ public class StreamingInputDocumentFactory { private static final String INDEX_FIELD_PREFIX = "__"; - private static final String DS_VERSION = INDEX_FIELD_PREFIX + "dsversion"; - - private static final String DS_ID = INDEX_FIELD_PREFIX + "dsid"; - private static final String RESULT = "result"; private static final String INDEX_RESULT = INDEX_FIELD_PREFIX + RESULT; @@ -65,20 +61,13 @@ public class StreamingInputDocumentFactory { private final ThreadLocal<XMLEventFactory> eventFactory = ThreadLocal .withInitial(XMLEventFactory::newInstance); - private final String version; - - private final String dsId; - private String resultName = DEFAULTDNETRESULT; - public StreamingInputDocumentFactory(final String version, final String dsId) { - this(version, dsId, DEFAULTDNETRESULT); + public StreamingInputDocumentFactory() { + this(DEFAULTDNETRESULT); } - public StreamingInputDocumentFactory( - final String version, final String dsId, final String resultName) { - this.version = version; - this.dsId = dsId; + public StreamingInputDocumentFactory(final String resultName) { this.resultName = resultName; } @@ -111,14 +100,6 @@ public class StreamingInputDocumentFactory { } } - if (version != null) { - indexDocument.addField(DS_VERSION, version); - } - - if (dsId != null) { - indexDocument.addField(DS_ID, dsId); - } - if (!indexDocument.containsKey(INDEX_RECORD_ID)) { throw new IllegalStateException("cannot extract record ID from: " + inputDocument); }
diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/EOSCFuture_Test.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/EOSCFuture_Test.java index 3e1a501d1..8800abf95 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/EOSCFuture_Test.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/EOSCFuture_Test.java @@ -79,8 +79,7 @@ public class EOSCFuture_Test { final String indexRecordXML = XmlIndexingJob.toIndexRecord(tr, record); - final SolrInputDocument solrDoc = new StreamingInputDocumentFactory(VERSION, DSID) - .parseDocument(indexRecordXML); + final SolrInputDocument solrDoc = new StreamingInputDocumentFactory().parseDocument(indexRecordXML); final String xmlDoc = ClientUtils.toXML(solrDoc);
diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java index cd5e08426..ce593cf07 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java @@ -39,9 +39,6 @@ import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory; */ public class IndexRecordTransformerTest { - public static final String VERSION = "2021-04-15T10:05:53Z"; - public static final String DSID = "b9ee796a-c49f-4473-a708-e7d67b84c16d_SW5kZXhEU1Jlc291cmNlcy9JbmRleERTUmVzb3VyY2VUeXBl"; - private ContextMapper contextMapper; @BeforeEach @@ -197,8 +194,7 @@ public class IndexRecordTransformerTest { final String indexRecordXML = XmlIndexingJob.toIndexRecord(tr, record); - final SolrInputDocument solrDoc = new StreamingInputDocumentFactory(VERSION, DSID) - .parseDocument(indexRecordXML); + final SolrInputDocument solrDoc = new StreamingInputDocumentFactory().parseDocument(indexRecordXML); final String xmlDoc = ClientUtils.toXML(solrDoc);
diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigTest.java index ab98b1da2..451e29128 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigTest.java @@ -115,16 +115,9 @@ public class SolrConfigTest extends SolrTest { for (SolrDocument doc : rsp.getResults()) { System.out .println( - doc.get("score") + "\t" + doc.get("__indexrecordidentifier") + "\t" + - doc.get("resultidentifier") + "\t" + - doc.get("resultauthor") + "\t" + - doc.get("resultacceptanceyear") + "\t" + - doc.get("resultsubject") + "\t" + - doc.get("resulttitle") + "\t" + - doc.get("relprojectname") + "\t" + - doc.get("resultdescription") + "\t" + - doc.get("__all") + "\t"); + doc.get("__result") + "\t" + ); } } }
diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml index be2ee7b98..0bf588a57 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml @@ -1,165 +1,116 @@
[hunk body unrecoverable: the 165 removed and 116 added field definitions of fields.xml were stripped to bare diff markers during extraction; the new file ends without a trailing newline]
From 23112929e96dbbce3ece27c5a09e3c4e79e2c57f Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 10 Feb 2023 15:34:52 +0100 Subject: [PATCH 02/16] [FoS] changed the default separator from comma to tab to solve the issue in subject value split --- .../main/java/eu/dnetlib/dhp/actionmanager/Constants.java | 1 + .../createunresolvedentities/GetFOSSparkJob.java | 7 +++---- 2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java index bd223d7c9..07410969b 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java @@ -22,6 +22,7 @@ public class Constants { public static final String DOI_CLASSNAME = "Digital Object Identifier"; public static final String DEFAULT_DELIMITER = ","; + public static final String DEFAULT_FOS_DELIMITER = "\t"; public static final String UPDATE_DATA_INFO_TYPE = "update"; public static final String UPDATE_SUBJECT_FOS_CLASS_ID = "subject:fos";
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetFOSSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetFOSSparkJob.java index 75fe42e90..c98f1b05a 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetFOSSparkJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetFOSSparkJob.java @@ -1,7 +1,7 @@ package eu.dnetlib.dhp.actionmanager.createunresolvedentities; -import static eu.dnetlib.dhp.actionmanager.Constants.DEFAULT_DELIMITER; +import static eu.dnetlib.dhp.actionmanager.Constants.DEFAULT_FOS_DELIMITER; import static eu.dnetlib.dhp.actionmanager.Constants.isSparkSessionManaged; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; @@ -9,8 +9,7 @@ import java.io.Serializable; import java.util.Optional; import org.apache.commons.io.IOUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; + import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.*; @@ -49,7 +48,7 @@ public class GetFOSSparkJob implements Serializable { final String delimiter = Optional .ofNullable(parser.get("delimiter")) - .orElse(DEFAULT_DELIMITER); + .orElse(DEFAULT_FOS_DELIMITER); SparkConf sconf = new SparkConf(); runWithSparkSession(
From 31e97c2a6b9048f791cb9831363e9fa29b573330 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 27 Feb 2023 11:38:29 +0100 Subject: [PATCH 03/16] [unresolved entities] updated oozie wf node labels --- .../createunresolvedentities/oozie_app/workflow.xml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
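A note on the delimiter switch in PATCH 02 above: several FOS subject labels contain literal commas (e.g. "0202 electrical engineering, electronic engineering, information engineering"), so splitting an input row on the old comma delimiter fragments a single subject into extra columns, while a tab split keeps the four logical fields intact. A minimal, self-contained sketch of the failure mode (plain String.split rather than the Spark CSV reader the job actually uses):

public class FosDelimiterSketch {
	public static void main(String[] args) {
		// one FOS row (doi, level1, level2, level3); the level2 label contains commas
		String row = "10.1109/icais50930.2021.9395847\t02 engineering and technology"
			+ "\t0202 electrical engineering, electronic engineering, information engineering"
			+ "\t020201 artificial intelligence & image processing";
		// comma-delimited parsing splits inside the level2 label: 6 fields instead of 4
		System.out.println(row.replace('\t', ',').split(",").length);
		// tab-delimited parsing preserves the four logical columns
		System.out.println(row.split("\t").length);
	}
}

diff --git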
a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/oozie_app/workflow.xml index a80bf4fbd..c8af64594 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/oozie_app/workflow.xml @@ -86,7 +86,7 @@ yarn cluster - Produces the unresolved from bip finder! + Produces the unresolved from BIP! Finder eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareBipFinder dhp-aggregation-${projectVersion}.jar @@ -135,7 +135,7 @@ yarn cluster - Produces the unresolved from FOS! + Produces the unresolved from FOS eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareFOSSparkJob dhp-aggregation-${projectVersion}.jar @@ -185,7 +185,7 @@ yarn cluster - Produces the unresolved from FOS! + Produces the unresolved from FOS eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareSDGSparkJob dhp-aggregation-${projectVersion}.jar From 80987801d74bafd9f64ff8961f075c4915609775 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Mon, 13 Feb 2023 13:03:49 +0100 Subject: [PATCH 04/16] [FoS] added check for null on level1 subject --- .../dnetlib/dhp/actionmanager/Constants.java | 2 +- .../createunresolvedentities/GetFosTest.java | 96 +++++++++++++++++++ .../createunresolvedentities/fos/fos_sbs.tsv | 40 ++++++++ 3 files changed, 137 insertions(+), 1 deletion(-) create mode 100644 dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetFosTest.java create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos_sbs.tsv diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java index 07410969b..1f2145d66 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java @@ -62,7 +62,7 @@ public class Constants { public static Subject getSubject(String sbj, String classid, String classname, String diqualifierclassid) { - if (sbj.equals(NULL)) + if (sbj == null || sbj.equals(NULL)) return null; Subject s = new Subject(); s.setValue(sbj); diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetFosTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetFosTest.java new file mode 100644 index 000000000..9a74ae07e --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetFosTest.java @@ -0,0 +1,96 @@ +package eu.dnetlib.dhp.actionmanager.createunresolvedentities; + +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel; +import org.apache.commons.io.FileUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocalFileSystem; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; 
+import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +/** + * @author miriam.baglioni + * @Date 13/02/23 + */ +public class GetFosTest { + + private static final Logger log = LoggerFactory.getLogger(ProduceTest.class); + + private static Path workingDir; + private static SparkSession spark; + private static LocalFileSystem fs; + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files.createTempDirectory(PrepareTest.class.getSimpleName()); + + fs = FileSystem.getLocal(new Configuration()); + log.info("using work dir {}", workingDir); + + SparkConf conf = new SparkConf(); + conf.setAppName(ProduceTest.class.getSimpleName()); + + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + + spark = SparkSession + .builder() + .appName(PrepareTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } + + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } + @Test + void test3() throws Exception { + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos_sbs.tsv") + .getPath(); + + + final String outputPath = workingDir.toString() + "/fos.json"; + GetFOSSparkJob + .main( + new String[] { + "--isSparkSessionManaged", Boolean.FALSE.toString(), + "--sourcePath", sourcePath, + + "-outputPath", outputPath + + }); + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD<FOSDataModel> tmp = sc + .textFile(outputPath) + .map(item -> OBJECT_MAPPER.readValue(item, FOSDataModel.class)); + + tmp.foreach(t -> Assertions.assertTrue(t.getDoi() != null)); + tmp.foreach(t -> Assertions.assertTrue(t.getLevel1() != null)); + tmp.foreach(t -> Assertions.assertTrue(t.getLevel2() != null)); + tmp.foreach(t -> Assertions.assertTrue(t.getLevel3() != null)); + + } +}
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos_sbs.tsv b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos_sbs.tsv new file mode 100644 index 000000000..98a338e2d --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos_sbs.tsv @@ -0,0 +1,40 @@ +doi level1 level2 level3 +10.1080/09638237.2018.1466033 03 medical and health sciences 0302 clinical medicine 030212 general & internal medicine +10.1016/j.dsi.2015.10.003 03 medical and health sciences 0301 basic medicine 030105 genetics & heredity +10.1007/s10072-017-2914-9 03 medical and health sciences 0302 clinical medicine 030217 neurology & neurosurgery +10.1016/j.bspc.2021.102726 02 engineering and technology 0206 medical engineering 020601 biomedical engineering +10.1177/0306312706069439 06 humanities and the arts 0601 history and archaeology 060101 anthropology
+10.1016/j.jacep.2016.05.010 03 medical and health sciences 0302 clinical medicine 030212 general & internal medicine +10.1111/anae.13418 03 medical and health sciences 0302 clinical medicine 030212 general & internal medicine +10.1142/s1793744210000168 01 natural sciences 0103 physical sciences 010306 general physics +10.1016/j.jadohealth.2019.04.029 03 medical and health sciences 0302 clinical medicine 030212 general & internal medicine +10.1109/icais50930.2021.9395847 02 engineering and technology 0202 electrical engineering, electronic engineering, information engineering 020201 artificial intelligence & image processing +10.1145/3154837 01 natural sciences 0101 mathematics 010102 general mathematics +10.1038/srep38130 03 medical and health sciences 0301 basic medicine 030106 microbiology +10.1007/s13369-017-2871-x 02 engineering and technology 0202 electrical engineering, electronic engineering, information engineering 020201 artificial intelligence & image processing +10.1063/1.4964718 03 medical and health sciences 0301 basic medicine 030104 developmental biology +10.1007/s12603-019-1276-9 03 medical and health sciences 0302 clinical medicine 030212 general & internal medicine +10.1002/cam4.1463 03 medical and health sciences 0301 basic medicine 030104 developmental biology +10.1164/rccm.201611-2290ed 03 medical and health sciences 0302 clinical medicine 030212 general & internal medicine +10.1088/1757-899x/225/1/012132 01 natural sciences 0105 earth and related environmental sciences 010504 meteorology & atmospheric sciences +10.1117/1.jmm.15.1.015501 02 engineering and technology 0210 nano-technology 021001 nanoscience & nanotechnology +10.1088/1361-6587/ab569d 01 natural sciences 0103 physical sciences 010303 astronomy & astrophysics +10.1016/j.rser.2015.11.092 02 engineering and technology 0202 electrical engineering, electronic engineering, information engineering 020209 energy +10.1016/j.jhydrol.2013.06.035 01 natural sciences 0105 earth and related environmental sciences 010504 meteorology & atmospheric sciences +10.1111/php.12892 03 medical and health sciences 0301 basic medicine 030104 developmental biology +10.1088/0264-9381/27/10/105001 01 natural sciences 0103 physical sciences 010308 nuclear & particles physics +10.1016/j.matchemphys.2018.02.039 02 engineering and technology 0210 nano-technology 021001 nanoscience & nanotechnology +10.1098/rsos.160993 03 medical and health sciences 0301 basic medicine 030104 developmental biology +10.1016/j.rinp.2017.07.054 02 engineering and technology 0209 industrial biotechnology 020901 industrial engineering & automation +10.1111/eip.12348 03 medical and health sciences 0302 clinical medicine 030227 psychiatry +10.20965/jrm.2016.p0371 02 engineering and technology 0201 civil engineering 020101 civil engineering +10.2337/dci19-0036 03 medical and health sciences 0302 clinical medicine 030212 general & internal medicine +10.1155/2018/7692913 01 natural sciences 0104 chemical sciences 010404 medicinal & biomolecular chemistry +10.1117/12.2262306 02 engineering and technology 0202 electrical engineering, electronic engineering, information engineering 020206 networking & telecommunications +10.1021/acs.jpcb.7b01885 01 natural sciences 0104 chemical sciences 010405 organic chemistry +10.1177/0033294117711131 05 social sciences 0502 economics and business 050203 business & management +10.1016/j.jrurstud.2017.08.019 05 social sciences 0502 economics and business 050203 business & management +10.1111/febs.15296 03 medical and health 
sciences 0301 basic medicine 030104 developmental biology +10.3923/jeasci.2017.6922.6927 05 social sciences 0505 law 050501 criminology +10.1007/s10854-017-6376-x 02 engineering and technology 0202 electrical engineering, electronic engineering, information engineering 020208 electrical & electronic engineering +10.3390/app10176095 02 engineering and technology 0202 electrical engineering, electronic engineering, information engineering 020209 energy \ No newline at end of file From 7aebedb43c5cf9fd8f527f967827118e48eb6e8e Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 27 Feb 2023 11:51:27 +0100 Subject: [PATCH 05/16] code formatting --- .../GetFOSSparkJob.java | 1 - .../usagestats/SparkAtomicActionUsageJob.java | 9 ++- .../createunresolvedentities/GetFosTest.java | 81 ++++++++++--------- .../dhp/oa/provision/SolrConfigTest.java | 5 +- 4 files changed, 50 insertions(+), 46 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetFOSSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetFOSSparkJob.java index c98f1b05a..0cc2f93df 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetFOSSparkJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetFOSSparkJob.java @@ -9,7 +9,6 @@ import java.io.Serializable; import java.util.Optional; import org.apache.commons.io.IOUtils; - import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.*; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/usagestats/SparkAtomicActionUsageJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/usagestats/SparkAtomicActionUsageJob.java index b25da07e0..9b444c6fa 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/usagestats/SparkAtomicActionUsageJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/usagestats/SparkAtomicActionUsageJob.java @@ -83,9 +83,12 @@ public class SparkAtomicActionUsageJob implements Serializable { private static void prepareData(String dbname, SparkSession spark, String workingPath, String tableName, String attribute_name) { spark - .sql(String.format( - "select %s as id, sum(downloads) as downloads, sum(views) as views " + - "from %s.%s group by %s", attribute_name, dbname, tableName, attribute_name)) + .sql( + String + .format( + "select %s as id, sum(downloads) as downloads, sum(views) as views " + + "from %s.%s group by %s", + attribute_name, dbname, tableName, attribute_name)) .as(Encoders.bean(UsageStatsModel.class)) .write() .mode(SaveMode.Overwrite) diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetFosTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetFosTest.java index 9a74ae07e..7e0acc2bb 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetFosTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetFosTest.java @@ -1,7 +1,10 @@ + package eu.dnetlib.dhp.actionmanager.createunresolvedentities; -import com.fasterxml.jackson.databind.ObjectMapper; -import 
eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + import org.apache.commons.io.FileUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -17,9 +20,9 @@ import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel; /** * @author miriam.baglioni @@ -27,48 +30,48 @@ import java.nio.file.Path; */ public class GetFosTest { - private static final Logger log = LoggerFactory.getLogger(ProduceTest.class); + private static final Logger log = LoggerFactory.getLogger(ProduceTest.class); - private static Path workingDir; - private static SparkSession spark; - private static LocalFileSystem fs; - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static Path workingDir; + private static SparkSession spark; + private static LocalFileSystem fs; + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - @BeforeAll - public static void beforeAll() throws IOException { - workingDir = Files.createTempDirectory(PrepareTest.class.getSimpleName()); + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files.createTempDirectory(PrepareTest.class.getSimpleName()); - fs = FileSystem.getLocal(new Configuration()); - log.info("using work dir {}", workingDir); + fs = FileSystem.getLocal(new Configuration()); + log.info("using work dir {}", workingDir); - SparkConf conf = new SparkConf(); - conf.setAppName(ProduceTest.class.getSimpleName()); + SparkConf conf = new SparkConf(); + conf.setAppName(ProduceTest.class.getSimpleName()); - conf.setMaster("local[*]"); - conf.set("spark.driver.host", "localhost"); - conf.set("hive.metastore.local", "true"); - conf.set("spark.ui.enabled", "false"); - conf.set("spark.sql.warehouse.dir", workingDir.toString()); - conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); - spark = SparkSession - .builder() - .appName(PrepareTest.class.getSimpleName()) - .config(conf) - .getOrCreate(); - } + spark = SparkSession + .builder() + .appName(PrepareTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } - @AfterAll - public static void afterAll() throws IOException { - FileUtils.deleteDirectory(workingDir.toFile()); - spark.stop(); - } - @Test + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } + + @Test void test3() throws Exception { - final String sourcePath = getClass() - .getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos_sbs.tsv") - .getPath(); - + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos_sbs.tsv") + .getPath(); final String outputPath = workingDir.toString() + "/fos.json"; GetFOSSparkJob diff --git 
a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigTest.java index 451e29128..a9d885ecf 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigTest.java @@ -115,9 +115,8 @@ public class SolrConfigTest extends SolrTest { for (SolrDocument doc : rsp.getResults()) { System.out .println( - doc.get("__indexrecordidentifier") + "\t" + - doc.get("__result") + "\t" - ); + doc.get("__indexrecordidentifier") + "\t" + + doc.get("__result") + "\t"); } } }
From 78e51c182ab43b7c14d835a9829e4bcf0a510ae6 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Tue, 28 Feb 2023 10:16:01 +0100 Subject: [PATCH 06/16] Added missing parameter to raw all workflow --- .../eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml index 069dd09f7..9d89534f0 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml @@ -224,6 +224,7 @@ -fODF -lstore -iclaim + --nameNode${nameNode} @@ -249,6 +250,7 @@ -fOAF -lstore -iclaim + --nameNode${nameNode}
From 832a75d0121b898fff6dfa8bc9002ad236e62d96 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Tue, 28 Feb 2023 10:16:34 +0100 Subject: [PATCH 07/16] added mapping for crossref funder --- .../doiboost/crossref/Crossref2Oaf.scala | 17 +++++++++++++++++ 1 file changed, 17 insertions(+)
diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index 75aa4a024..4cdbe60f7 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -505,6 +505,23 @@ case object Crossref2Oaf { val targetId = getProjectId("cihr________", "1e5e62235d094afd01cd56e65112fc63") queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) + + case "10.13039/100020031" => + val targetId = getProjectId("tara________", "1e5e62235d094afd01cd56e65112fc63") + queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) + queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) + + + case "10.13039/501100005416" => generateSimpleRelationFromAward(funder, "rcn_________", a => a) + case "10.13039/501100004744" => INNOVIRIS + + + + + + + + case "10.13039/501100002848" => generateSimpleRelationFromAward(funder, "conicytf____", a => a) case "10.13039/501100003448" => generateSimpleRelationFromAward(funder, "gsrt________", extractECAward) case "10.13039/501100010198" => generateSimpleRelationFromAward(funder, "sgov________", a => a)
From 69fa61649046f3a4cfa244ddc33d423d8cc91926 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Tue, 28 Feb 2023 10:27:38 +0100 Subject: [PATCH 08/16] removed wrong content ---
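For orientation on the funder mapping added in PATCH 07 and trimmed in PATCH 08: each matched funder DOI ultimately yields a pair of result/project relations (isProducedBy and its inverse produces). The generateRelation helper itself is not part of this excerpt; a hedged Java sketch of what such a relation amounts to in the OpenAIRE model, where the relType/subRelType literals are assumptions not confirmed by the diff:

import eu.dnetlib.dhp.schema.oaf.Relation;

public class FunderRelationSketch {

	// builds one direction of the result <-> project link; the mapping emits both directions
	static Relation generateRelation(String source, String target, String relClass) {
		Relation r = new Relation();
		r.setSource(source);
		r.setTarget(target);
		r.setRelType("resultProject"); // assumption: result-to-project relation type
		r.setSubRelType("outcome"); // assumption: funding outcome sub-type
		r.setRelClass(relClass); // e.g. isProducedBy / produces, as in the diff
		return r;
	}

	public static void main(String[] args) {
		Relation r = generateRelation("result-id", "project-id", "isProducedBy");
		System.out.println(r.getSource() + " -[" + r.getRelClass() + "]-> " + r.getTarget());
	}
}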
.../eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala | 9 --------- 1 file changed, 9 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index 4cdbe60f7..8ef564a62 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -513,15 +513,6 @@ case object Crossref2Oaf { case "10.13039/501100005416" => generateSimpleRelationFromAward(funder, "rcn_________", a => a) - case "10.13039/501100004744" => INNOVIRIS - - - - - - - - case "10.13039/501100002848" => generateSimpleRelationFromAward(funder, "conicytf____", a => a) case "10.13039/501100003448" => generateSimpleRelationFromAward(funder, "gsrt________", extractECAward) case "10.13039/501100010198" => generateSimpleRelationFromAward(funder, "sgov________", a => a) From 0559d8b412a83e336ada2e10813b83412bac26fd Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 28 Feb 2023 10:57:32 +0100 Subject: [PATCH 09/16] WIP monodirectional citations --- .../eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index 8ef564a62..7b5f9fb91 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -397,17 +397,7 @@ case object Crossref2Oaf { from.setDataInfo(source.getDataInfo) from.setLastupdatetimestamp(source.getLastupdatetimestamp) - val to = new Relation - to.setTarget(source.getId) - to.setSource(targetId) - to.setRelType(ModelConstants.RESULT_RESULT) - to.setRelClass(ModelConstants.IS_CITED_BY) - to.setSubRelType(ModelConstants.CITATION) - to.setCollectedfrom(source.getCollectedfrom) - to.setDataInfo(source.getDataInfo) - to.setLastupdatetimestamp(source.getLastupdatetimestamp) - - List(from, to) + List(from) } def generateCitationRelations(dois: List[String], result: Result): List[Relation] = { From 2f7346e9cfbba21c1efb69852834d3c9fc066424 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 28 Feb 2023 13:30:51 +0100 Subject: [PATCH 10/16] WIP monodirectional citations, Datacite --- .../dhp/datacite/DataciteModelConstants.scala | 20 ------ .../DataciteToOAFTransformation.scala | 61 +++++++++++++------ 2 files changed, 41 insertions(+), 40 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteModelConstants.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteModelConstants.scala index a59779387..6e9d53aa3 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteModelConstants.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteModelConstants.scala @@ -78,16 +78,6 @@ object DataciteModelConstants { OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, DATACITE_NAME) val subRelTypeMapping: Map[String, OAFRelations] = Map( - ModelConstants.REFERENCES -> OAFRelations( - ModelConstants.REFERENCES, - ModelConstants.IS_REFERENCED_BY, - ModelConstants.RELATIONSHIP - ), - ModelConstants.IS_REFERENCED_BY -> OAFRelations( 
- ModelConstants.IS_REFERENCED_BY, - ModelConstants.REFERENCES, - ModelConstants.RELATIONSHIP - ), ModelConstants.IS_SUPPLEMENTED_BY -> OAFRelations( ModelConstants.IS_SUPPLEMENTED_BY, ModelConstants.IS_SUPPLEMENT_TO, @@ -163,16 +153,6 @@ object DataciteModelConstants { ModelConstants.IS_SOURCE_OF, ModelConstants.VERSION ), - ModelConstants.CITES -> OAFRelations( - ModelConstants.CITES, - ModelConstants.IS_CITED_BY, - ModelConstants.CITATION - ), - ModelConstants.IS_CITED_BY -> OAFRelations( - ModelConstants.IS_CITED_BY, - ModelConstants.CITES, - ModelConstants.CITATION - ), ModelConstants.IS_VARIANT_FORM_OF -> OAFRelations( ModelConstants.IS_VARIANT_FORM_OF, ModelConstants.IS_DERIVED_FROM, diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala index a7ad9e2d6..049910ade 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala @@ -645,7 +645,7 @@ object DataciteToOAFTransformation { id: String, date: String ): List[Relation] = { - rels + val bidirectionalRels: List[Relation] = rels .filter(r => subRelTypeMapping .contains(r.relationType) && (r.relatedIdentifierType.equalsIgnoreCase("doi") || @@ -653,27 +653,48 @@ object DataciteToOAFTransformation { r.relatedIdentifierType.equalsIgnoreCase("arxiv")) ) .map(r => { - val rel = new Relation - rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava) - rel.setDataInfo(dataInfo) - val subRelType = subRelTypeMapping(r.relationType).relType - rel.setRelType(REL_TYPE_VALUE) - rel.setSubRelType(subRelType) - rel.setRelClass(r.relationType) - - val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date) - - rel.setProperties(List(dateProps).asJava) - - rel.setSource(id) - rel.setTarget( - DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType) - ) - rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava) - rel.getCollectedfrom.asScala.map(c => c.getValue).toList - rel + val target = DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType) + relation(id, target, subRelType, r.relationType, date) }) + val citationRels:List[Relation] = rels + .filter(r => + (r.relatedIdentifierType.equalsIgnoreCase("doi") || + r.relatedIdentifierType.equalsIgnoreCase("pmid") || + r.relatedIdentifierType.equalsIgnoreCase("arxiv")) && + (r.relationType.toLowerCase.contains("cite") || r.relationType.toLowerCase.contains("reference"))) + .map(r => { + r.relationType match { + case ModelConstants.CITES | ModelConstants.REFERENCES => + val target = DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType) + relation(id, target, ModelConstants.CITATION, ModelConstants.CITES, date) + case ModelConstants.IS_CITED_BY | ModelConstants.IS_REFERENCED_BY => + val source = DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType) + relation(source, id, ModelConstants.CITATION, ModelConstants.CITES, date) + } + }) + + citationRels ::: bidirectionalRels + } + + def relation(source:String, target:String, subRelType:String, relClass:String, date:String): Relation = { + val rel = new Relation + rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava) + rel.setDataInfo(dataInfo) + + rel.setRelType(REL_TYPE_VALUE) + 
rel.setSubRelType(subRelType) + rel.setRelClass(relClass) + + val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date) + + rel.setProperties(List(dateProps).asJava) + + rel.setSource(source) + rel.setTarget(target) + rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava) + rel.getCollectedfrom.asScala.map(c => c.getValue).toList + rel + } def generateDSId(input: String): String = {
From ad745c0aa32e75079323370f93030c7bc180c072 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Tue, 28 Feb 2023 14:58:27 +0100 Subject: [PATCH 11/16] [CrossrefFunderMapping] fixed issues on funder name --- .../eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index 7b5f9fb91..e78da07e2 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -539,21 +539,21 @@ case object Crossref2Oaf { generateSimpleRelationFromAward(funder, "corda_____he", extractECAward) //FCT case "10.13039/501100001871" => - generateSimpleRelationFromAward(funder, "fct_________", extractECAward) + generateSimpleRelationFromAward(funder, "fct_________", a => a) //NHMRC case "10.13039/501100000925" => - generateSimpleRelationFromAward(funder, "mhmrc_______", extractECAward) + generateSimpleRelationFromAward(funder, "nhmrc_______", a => a) //NIH case "10.13039/100000002" => - generateSimpleRelationFromAward(funder, "nih_________", extractECAward) + generateSimpleRelationFromAward(funder, "nih_________", a => a) //NWO case "10.13039/501100003246" => - generateSimpleRelationFromAward(funder, "nwo_________", extractECAward) + generateSimpleRelationFromAward(funder, "nwo_________", a => a) //UKRI case "10.13039/100014013" | "10.13039/501100000267" | "10.13039/501100000268" | "10.13039/501100000269" | "10.13039/501100000266" | "10.13039/501100006041" | "10.13039/501100000265" | "10.13039/501100000270" | "10.13039/501100013589" | "10.13039/501100000271" => - generateSimpleRelationFromAward(funder, "nwo_________", extractECAward) + generateSimpleRelationFromAward(funder, "ukri________", a => a) case _ => logger.debug("no match for " + funder.DOI.get) @@ -566,10 +566,11 @@ case "European Union's" => generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward) generateSimpleRelationFromAward(funder, "corda_______", extractECAward) + generateSimpleRelationFromAward(funder, "corda_____he", extractECAward) case "The French National Research Agency (ANR)" | "The French National Research Agency" => generateSimpleRelationFromAward(funder, "anr_________", a => a) case "CONICYT, Programa de Formación de Capital Humano Avanzado" => - generateSimpleRelationFromAward(funder, "conicytf____", extractECAward) + generateSimpleRelationFromAward(funder, "conicytf____", a => a) case "Wellcome Trust Masters Fellowship" => generateSimpleRelationFromAward(funder, "wt__________", a => a) val targetId = getProjectId("wt__________", "1e5e62235d094afd01cd56e65112fc63")
From 9c59dac859901d71e2787192332f0275a879d084 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 1 Mar 2023 10:16:20 +0100 Subject: [PATCH 12/16] followup changes reorganising the mdstore synchronisation mechanism ---
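One note on the mdstore consolidation this patch performs: the dedicated dump directories (odf_records, oaf_records, oaf_records_invisible) collapse into a single ${contentPath}/mdstore tree, which the downstream entity-generation steps read back through the mdstore/*/* glob visible in the workflow changes below. A hedged Java sketch of how such a two-level glob resolves on HDFS, where the base path and the exact subdirectory layout are illustrative assumptions:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class MdstoreGlobSketch {
	public static void main(String[] args) throws Exception {
		FileSystem fs = FileSystem.get(new Configuration());
		// expands two directory levels under the consolidated mdstore root,
		// mirroring the ${contentPath}/mdstore/*/* argument in the workflow
		FileStatus[] stores = fs.globStatus(new Path("/content/mdstore/*/*"));
		for (FileStatus status : stores) {
			System.out.println(status.getPath());
		}
	}
}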
.../doiboost/crossref/Crossref2Oaf.scala | 1 - .../oa/graph/raw_all/oozie_app/workflow.xml | 53 ++--- .../raw_claims/oozie_app/config-default.xml | 18 -- .../graph/raw_claims/oozie_app/workflow.xml | 162 --------------- .../graph/raw_db/oozie_app/config-default.xml | 18 -- .../oa/graph/raw_db/oozie_app/workflow.xml | 195 ------------------ .../oozie_app/config-default.xml | 18 -- .../raw_hdfs_stores/oozie_app/workflow.xml | 157 -------------- .../raw_step1/oozie_app/config-default.xml | 18 -- .../oa/graph/raw_step1/oozie_app/workflow.xml | 67 ------ .../raw_step2/oozie_app/config-default.xml | 18 -- .../oa/graph/raw_step2/oozie_app/workflow.xml | 65 ------ .../raw_step3/oozie_app/config-default.xml | 18 -- .../oa/graph/raw_step3/oozie_app/workflow.xml | 60 ------ 14 files changed, 19 insertions(+), 849 deletions(-) delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/oozie_app/workflow.xml delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_db/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_db/oozie_app/workflow.xml delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_hdfs_stores/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_hdfs_stores/oozie_app/workflow.xml delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step1/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step1/oozie_app/workflow.xml delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step2/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step2/oozie_app/workflow.xml delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step3/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step3/oozie_app/workflow.xml diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index e78da07e2..9c63b709b 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -501,7 +501,6 @@ case object Crossref2Oaf { queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) - case "10.13039/501100005416" => generateSimpleRelationFromAward(funder, "rcn_________", a => a) case "10.13039/501100002848" => generateSimpleRelationFromAward(funder, "conicytf____", a => a) case "10.13039/501100003448" => generateSimpleRelationFromAward(funder, "gsrt________", extractECAward) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml index 9d89534f0..a47b42c90 
100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml @@ -214,16 +214,13 @@ - - - eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication - -p${contentPath}/odf_claims - -mongourl${mongoURL} - -mongodb${mongoDb} - -fODF - -lstore - -iclaim + --hdfsPath${contentPath}/mdstore + --mongoBaseUrl${mongoURL} + --mongoDb${mongoDb} + --mdFormatODF + --mdLayoutstore + --mdInterpretationclaim --nameNode${nameNode} @@ -240,16 +237,13 @@ - - - eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication - -p${contentPath}/oaf_claims - -mongourl${mongoURL} - -mongodb${mongoDb} - -fOAF - -lstore - -iclaim + --hdfsPath${contentPath}/mdstore + --mongoBaseUrl${mongoURL} + --mongoDb${mongoDb} + --mdFormatOAF + --mdLayoutstore + --mdInterpretationclaim --nameNode${nameNode} @@ -293,11 +287,8 @@ - - - eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication - --hdfsPath${contentPath}/odf_records + --hdfsPath${contentPath}/mdstore --mongoBaseUrl${mongoURL} --mongoDb${mongoDb} --mdFormatODF @@ -319,11 +310,8 @@ - - - eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication - --hdfsPath${contentPath}/oaf_records + --hdfsPath${contentPath}/mdstore --mongoBaseUrl${mongoURL} --mongoDb${mongoDb} --mdFormatOAF @@ -337,11 +325,8 @@ - - - eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication - --hdfsPath${contentPath}/oaf_records_invisible + --hdfsPath${contentPath}/mdstore --mongoBaseUrl${mongoURL} --mongoDb${mongoDb} --mdFormatOAF @@ -377,7 +362,7 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --hdfsPath${contentPath}/odf_records_hdfs + --hdfsPath${contentPath}/odf_mdstore_hdfs --mdstoreManagerUrl${mdstoreManagerUrl} --mdFormatODF --mdLayoutstore @@ -411,7 +396,7 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --hdfsPath${contentPath}/oaf_records_hdfs + --hdfsPath${contentPath}/oaf_mdstore_hdfs --mdstoreManagerUrl${mdstoreManagerUrl} --mdFormatOAF --mdLayoutstore @@ -544,7 +529,7 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --sourcePaths${contentPath}/db_openaire,${contentPath}/db_openorgs,${contentPath}/oaf_records,${contentPath}/odf_records,${contentPath}/oaf_records_hdfs,${contentPath}/odf_records_hdfs,${contentPath}/oaf_records_invisible + --sourcePaths${contentPath}/db_openaire,${contentPath}/db_openorgs,${contentPath}/oaf_mdstore_hdfs,${contentPath}/odf_mdstore_hdfs,${contentPath}/mdstore/*/* --invalidPath${workingDir}/invalid_records --isLookupUrl${isLookupUrl} @@ -568,7 +553,7 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --sourcePaths${contentPath}/db_openaire,${contentPath}/db_openorgs,${contentPath}/oaf_records,${contentPath}/odf_records,${contentPath}/oaf_records_hdfs,${contentPath}/odf_records_hdfs,${contentPath}/oaf_records_invisible + --sourcePaths${contentPath}/db_openaire,${contentPath}/db_openorgs,${contentPath}/oaf_mdstore_hdfs,${contentPath}/odf_mdstore_hdfs,${contentPath}/mdstore/*/* --targetPath${workingDir}/entities --isLookupUrl${isLookupUrl} --shouldHashId${shouldHashId} diff --git 
a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/oozie_app/config-default.xml deleted file mode 100644 index 2e0ed9aee..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/oozie_app/config-default.xml +++ /dev/null @@ -1,18 +0,0 @@ - - - jobTracker - yarnRM - - - nameNode - hdfs://nameservice1 - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/oozie_app/workflow.xml deleted file mode 100644 index 4c319d037..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/oozie_app/workflow.xml +++ /dev/null @@ -1,162 +0,0 @@ - - - - reuseContent - false - should import content from the aggregator or reuse a previous version - - - contentPath - path location to store (or reuse) content from the aggregator - - - postgresURL - the postgres URL to access to the database - - - postgresUser - the user postgres - - - postgresPassword - the password postgres - - - dbSchema - beta - the database schema according to the D-Net infrastructure (beta or production) - - - mongoURL - mongoDB url, example: mongodb://[username:password@]host[:port] - - - mongoDb - mongo database - - - isLookupUrl - the address of the lookUp service - - - nsPrefixBlacklist - - a blacklist of nsprefixes (comma separeted) - - - sparkDriverMemory - memory for driver process - - - sparkExecutorMemory - memory for individual executor - - - sparkExecutorCores - number of cores used by single executor - - - oozieActionShareLibForSpark2 - oozie action sharelib for spark 2.* - - - spark2ExtraListeners - com.cloudera.spark.lineage.NavigatorAppListener - spark 2.* extra listeners classname - - - spark2SqlQueryExecutionListeners - com.cloudera.spark.lineage.NavigatorQueryListener - spark 2.* sql query execution listeners classname - - - spark2YarnHistoryServerAddress - spark 2.* yarn history server address - - - spark2EventLogDir - spark 2.* event log dir location - - - - - ${jobTracker} - ${nameNode} - - - mapreduce.job.queuename - ${queueName} - - - oozie.launcher.mapred.job.queue.name - ${oozieLauncherQueueName} - - - oozie.action.sharelib.for.spark - ${oozieActionShareLibForSpark2} - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - eu.dnetlib.dhp.oa.graph.raw.MigrateDbEntitiesApplication - --hdfsPath${contentPath}/db_claims - --postgresUrl${postgresURL} - --postgresUser${postgresUser} - --postgresPassword${postgresPassword} - --isLookupUrl${isLookupUrl} - --actionclaims - --dbschema${dbSchema} - --nsPrefixBlacklist${nsPrefixBlacklist} - - - - - - - - - - - eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication - -p${contentPath}/odf_claims - -mongourl${mongoURL} - -mongodb${mongoDb} - -fODF - -lstore - -iclaim - - - - - - - - - - - eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication - -p${contentPath}/oaf_claims - -mongourl${mongoURL} - -mongodb${mongoDb} - -fOAF - -lstore - -iclaim - - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_db/oozie_app/config-default.xml 
b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_db/oozie_app/config-default.xml deleted file mode 100644 index 2e0ed9aee..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_db/oozie_app/config-default.xml +++ /dev/null @@ -1,18 +0,0 @@ - - - jobTracker - yarnRM - - - nameNode - hdfs://nameservice1 - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_db/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_db/oozie_app/workflow.xml deleted file mode 100644 index 31b726f39..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_db/oozie_app/workflow.xml +++ /dev/null @@ -1,195 +0,0 @@ - - - - contentPath - path location to store (or reuse) content from the aggregator - - - postgresURL - the postgres URL to access to the database - - - postgresUser - the user postgres - - - postgresPassword - the password postgres - - - dbSchema - beta - the database schema according to the D-Net infrastructure (beta or production) - - - isLookupUrl - the address of the lookUp service - - - nsPrefixBlacklist - - a blacklist of nsprefixes (comma separeted) - - - reuseContent - false - reuse content in the aggregator database - - - sparkDriverMemory - memory for driver process - - - sparkExecutorMemory - memory for individual executor - - - sparkExecutorCores - number of cores used by single executor - - - oozieActionShareLibForSpark2 - oozie action sharelib for spark 2.* - - - spark2ExtraListeners - com.cloudera.spark.lineage.NavigatorAppListener - spark 2.* extra listeners classname - - - spark2SqlQueryExecutionListeners - com.cloudera.spark.lineage.NavigatorQueryListener - spark 2.* sql query execution listeners classname - - - spark2YarnHistoryServerAddress - spark 2.* yarn history server address - - - spark2EventLogDir - spark 2.* event log dir location - - - - - ${jobTracker} - ${nameNode} - - - mapreduce.job.queuename - ${queueName} - - - oozie.launcher.mapred.job.queue.name - ${oozieLauncherQueueName} - - - oozie.action.sharelib.for.spark - ${oozieActionShareLibForSpark2} - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - ${wf:conf('reuseContent') eq false} - ${wf:conf('reuseContent') eq true} - - - - - - - - - - eu.dnetlib.dhp.oa.graph.raw.MigrateDbEntitiesApplication - --hdfsPath${contentPath}/db_records - --postgresUrl${postgresURL} - --postgresUser${postgresUser} - --postgresPassword${postgresPassword} - --isLookupUrl${isLookupUrl} - --actionopenaire - --dbschema${dbSchema} - --nsPrefixBlacklist${nsPrefixBlacklist} - - - - - - - - - - - eu.dnetlib.dhp.oa.graph.raw.MigrateDbEntitiesApplication - --hdfsPath${contentPath}/db_claims - --postgresUrl${postgresURL} - --postgresUser${postgresUser} - --postgresPassword${postgresPassword} - --isLookupUrl${isLookupUrl} - --dbschema${dbSchema} - --actionclaims - --nsPrefixBlacklist${nsPrefixBlacklist} - - - - - - - - yarn - cluster - GenerateEntities - eu.dnetlib.dhp.oa.graph.raw.GenerateEntitiesApplication - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory ${sparkExecutorMemory} - --executor-cores ${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - 
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - - --sourcePaths${contentPath}/db_records,${contentPath}/db_claims - --targetPath${workingDir}/entities - --isLookupUrl${isLookupUrl} - --shouldHashIdtrue - - - - - - - - yarn - cluster - GenerateGraph - eu.dnetlib.dhp.oa.graph.raw.DispatchEntitiesApplication - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory ${sparkExecutorMemory} - --executor-cores ${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 - - --sourcePath${workingDir}/entities - --graphRawPath${workingDir}/graph_aggregator - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_hdfs_stores/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_hdfs_stores/oozie_app/config-default.xml deleted file mode 100644 index 2e0ed9aee..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_hdfs_stores/oozie_app/config-default.xml +++ /dev/null @@ -1,18 +0,0 @@ - - - jobTracker - yarnRM - - - nameNode - hdfs://nameservice1 - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_hdfs_stores/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_hdfs_stores/oozie_app/workflow.xml deleted file mode 100644 index bfe2dff0b..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_hdfs_stores/oozie_app/workflow.xml +++ /dev/null @@ -1,157 +0,0 @@ - - - - - graphOutputPath - the target path to store raw graph - - - contentPath - path location to store (or reuse) content from the aggregator - - - mdstoreManagerUrl - the address of the Mdstore Manager - - - isLookupUrl - the address of the lookUp service - - - sparkDriverMemory - memory for driver process - - - sparkExecutorMemory - memory for individual executor - - - sparkExecutorCores - number of cores used by single executor - - - oozieActionShareLibForSpark2 - oozie action sharelib for spark 2.* - - - spark2ExtraListeners - com.cloudera.spark.lineage.NavigatorAppListener - spark 2.* extra listeners classname - - - spark2SqlQueryExecutionListeners - com.cloudera.spark.lineage.NavigatorQueryListener - spark 2.* sql query execution listeners classname - - - spark2YarnHistoryServerAddress - spark 2.* yarn history server address - - - spark2EventLogDir - spark 2.* event log dir location - - - - - ${jobTracker} - ${nameNode} - - - mapreduce.job.queuename - ${queueName} - - - oozie.launcher.mapred.job.queue.name - ${oozieLauncherQueueName} - - - oozie.action.sharelib.for.spark - ${oozieActionShareLibForSpark2} - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - yarn - cluster - ImportODF_hdfs - eu.dnetlib.dhp.oa.graph.raw.MigrateHdfsMdstoresApplication - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory ${sparkExecutorMemory} - --executor-cores ${sparkExecutorCores} - 
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step1/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step1/oozie_app/config-default.xml
deleted file mode 100644
index 2e0ed9aee..000000000
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step1/oozie_app/config-default.xml
+++ /dev/null
@@ -1,18 +0,0 @@
-<configuration>
-    <property>
-        <name>jobTracker</name>
-        <value>yarnRM</value>
-    </property>
-    <property>
-        <name>nameNode</name>
-        <value>hdfs://nameservice1</value>
-    </property>
-    <property>
-        <name>oozie.use.system.libpath</name>
-        <value>true</value>
-    </property>
-    <property>
-        <name>oozie.action.sharelib.for.spark</name>
-        <value>spark2</value>
-    </property>
-</configuration>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step1/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step1/oozie_app/workflow.xml
deleted file mode 100644
index c57371560..000000000
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step1/oozie_app/workflow.xml
+++ /dev/null
@@ -1,67 +0,0 @@
-<workflow-app name="import regular entities (step 1)" xmlns="uri:oozie:workflow:0.5">
-
-    <parameters>
-        <property>
-            <name>migrationPathStep1</name>
-            <description>the base path to store hdfs file</description>
-        </property>
-        <property>
-            <name>mongoURL</name>
-            <description>mongoDB url, example: mongodb://[username:password@]host[:port]</description>
-        </property>
-        <property>
-            <name>mongoDb</name>
-            <description>mongo database</description>
-        </property>
-    </parameters>
-
-    <start to="ResetWorkingPath"/>
-
-    <kill name="Kill">
-        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
-    </kill>
-
-    <action name="ResetWorkingPath">
-        <fs>
-            <delete path="${migrationPathStep1}"/>
-            <mkdir path="${migrationPathStep1}"/>
-        </fs>
-        <ok to="ImportODF"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="ImportODF">
-        <java>
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <main-class>eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication</main-class>
-            <arg>-p</arg><arg>${migrationPathStep1}</arg>
-            <arg>-mongourl</arg><arg>${mongoURL}</arg>
-            <arg>-mongodb</arg><arg>${mongoDb}</arg>
-            <arg>-f</arg><arg>ODF</arg>
-            <arg>-l</arg><arg>store</arg>
-            <arg>-i</arg><arg>cleaned</arg>
-            <arg>--nameNode</arg><arg>${nameNode}</arg>
-        </java>
-        <ok to="ImportOAF"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="ImportOAF">
-        <java>
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <main-class>eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication</main-class>
-            <arg>-p</arg><arg>${migrationPathStep1}</arg>
-            <arg>-mongourl</arg><arg>${mongoURL}</arg>
-            <arg>-mongodb</arg><arg>${mongoDb}</arg>
-            <arg>-f</arg><arg>OAF</arg>
-            <arg>-l</arg><arg>store</arg>
-            <arg>-i</arg><arg>cleaned</arg>
-            <arg>--nameNode</arg><arg>${nameNode}</arg>
-        </java>
-        <ok to="End"/>
-        <error to="Kill"/>
-    </action>
-
-    <end name="End"/>
-</workflow-app>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step2/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step2/oozie_app/config-default.xml
deleted file mode 100644
index 2e0ed9aee..000000000
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step2/oozie_app/config-default.xml
+++ /dev/null
@@ -1,18 +0,0 @@
-<configuration>
-    <property>
-        <name>jobTracker</name>
-        <value>yarnRM</value>
-    </property>
-    <property>
-        <name>nameNode</name>
-        <value>hdfs://nameservice1</value>
-    </property>
-    <property>
-        <name>oozie.use.system.libpath</name>
-        <value>true</value>
-    </property>
-    <property>
-        <name>oozie.action.sharelib.for.spark</name>
-        <value>spark2</value>
-    </property>
-</configuration>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step2/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step2/oozie_app/workflow.xml
deleted file mode 100644
index f6485ea9c..000000000
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step2/oozie_app/workflow.xml
+++ /dev/null
@@ -1,65 +0,0 @@
-<workflow-app name="import regular entities (step 2)" xmlns="uri:oozie:workflow:0.5">
-
-    <parameters>
-        <property>
-            <name>migrationPathStep1</name>
-            <description>the base path to store hdfs file</description>
-        </property>
-        <property>
-            <name>migrationPathStep2</name>
-            <description>the temporary path to store entities before dispatching</description>
-        </property>
-        <property>
-            <name>isLookupUrl</name>
-            <description>the address of the lookUp service</description>
-        </property>
-
-        <property>
-            <name>sparkDriverMemory</name>
-            <description>memory for driver process</description>
-        </property>
-        <property>
-            <name>sparkExecutorMemory</name>
-            <description>memory for individual executor</description>
-        </property>
-        <property>
-            <name>sparkExecutorCores</name>
-            <description>number of cores used by single executor</description>
-        </property>
-    </parameters>
-
-    <start to="ResetEntities"/>
-
-    <kill name="Kill">
-        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
-    </kill>
-
-    <action name="ResetEntities">
-        <fs>
-            <delete path="${migrationPathStep2}/all_entities"/>
-        </fs>
-        <ok to="GenerateEntities"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="GenerateEntities">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>GenerateEntities</name>
-            <class>eu.dnetlib.dhp.migration.step2.GenerateEntitiesApplication</class>
-            <jar>dhp-aggregation-${projectVersion}.jar</jar>
-            <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
-            <arg>-mt</arg><arg>yarn-cluster</arg>
-            <arg>-s</arg><arg>${migrationPathStep1}/db_records,${migrationPathStep1}/oaf_records,${migrationPathStep1}/odf_records</arg>
-            <arg>-t</arg><arg>${migrationPathStep2}/all_entities</arg>
-            <arg>--islookup</arg><arg>${isLookupUrl}</arg>
-        </spark>
-        <ok to="End"/>
-        <error to="Kill"/>
-    </action>
-
-    <end name="End"/>
-</workflow-app>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step3/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step3/oozie_app/config-default.xml
deleted file mode 100644
index 2e0ed9aee..000000000
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step3/oozie_app/config-default.xml
+++ /dev/null
@@ -1,18 +0,0 @@
-<configuration>
-    <property>
-        <name>jobTracker</name>
-        <value>yarnRM</value>
-    </property>
-    <property>
-        <name>nameNode</name>
-        <value>hdfs://nameservice1</value>
-    </property>
-    <property>
-        <name>oozie.use.system.libpath</name>
-        <value>true</value>
-    </property>
-    <property>
-        <name>oozie.action.sharelib.for.spark</name>
-        <value>spark2</value>
-    </property>
-</configuration>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step3/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step3/oozie_app/workflow.xml
deleted file mode 100644
index 8688f09d1..000000000
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step3/oozie_app/workflow.xml
+++ /dev/null
@@ -1,60 +0,0 @@
-<workflow-app name="import regular entities (step 3)" xmlns="uri:oozie:workflow:0.5">
-
-    <parameters>
-        <property>
-            <name>migrationPathStep2</name>
-            <description>the temporary path to store entities before dispatching</description>
-        </property>
-        <property>
-            <name>migrationPathStep3</name>
-            <description>the graph Raw base path</description>
-        </property>
-        <property>
-            <name>sparkDriverMemory</name>
-            <description>memory for driver process</description>
-        </property>
-        <property>
-            <name>sparkExecutorMemory</name>
-            <description>memory for individual executor</description>
-        </property>
-        <property>
-            <name>sparkExecutorCores</name>
-            <description>number of cores used by single executor</description>
-        </property>
-    </parameters>
-
-    <start to="ResetGraph"/>
-
-    <kill name="Kill">
-        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
-    </kill>
-
-    <action name="ResetGraph">
-        <fs>
-            <delete path="${migrationPathStep3}"/>
-        </fs>
-        <ok to="GenerateGraph"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="GenerateGraph">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>GenerateGraph</name>
-            <class>eu.dnetlib.dhp.migration.step3.DispatchEntitiesApplication</class>
-            <jar>dhp-aggregation-${projectVersion}.jar</jar>
-            <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
-            <arg>-mt</arg><arg>yarn-cluster</arg>
-            <arg>-s</arg><arg>${migrationPathStep2}/all_entities</arg>
-            <arg>-g</arg><arg>${migrationPathStep3}</arg>
-        </spark>
-        <ok to="End"/>
-        <error to="Kill"/>
-    </action>
-
-    <end name="End"/>
-</workflow-app>
\ No newline at end of file
From 16ad42e8f3dc9422dba71975aa52c5d662af73aa Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Wed, 1 Mar 2023 10:22:13 +0100
Subject: [PATCH 13/16] code formatting

---
 .../dnetlib/dhp/datacite/DataciteToOAFTransformation.scala | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala
index 049910ade..45f5f9729 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala
@@ -657,12 +657,13 @@
       val target = DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)
       relation(id, target, subRelType, r.relationType, date)
     })
-    val citationRels:List[Relation] = rels
+    val citationRels: List[Relation] = rels
       .filter(r =>
        (r.relatedIdentifierType.equalsIgnoreCase("doi") ||
          r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
          r.relatedIdentifierType.equalsIgnoreCase("arxiv")) &&
-        (r.relationType.toLowerCase.contains("cite") || r.relationType.toLowerCase.contains("reference")))
+        (r.relationType.toLowerCase.contains("cite") || r.relationType.toLowerCase.contains("reference"))
+      )
       .map(r => {
         r.relationType match {
           case ModelConstants.CITES | ModelConstants.REFERENCES =>
@@ -677,7 +678,7 @@
     citationRels ::: bidirectionalRels
   }
 
-  def relation(source:String, target:String, subRelType:String, relClass:String, date:String): Relation = {
+  def relation(source: String, target: String, subRelType: String, relClass: String, date: String): Relation = {
    val rel = new Relation
    rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
    rel.setDataInfo(dataInfo)
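
The predicate reformatted above is worth reading on its own: a DataCite relation is kept as a citation candidate only when the related identifier is a DOI, PMID or arXiv id and the relation type mentions a citation or a reference. A minimal Java sketch of the same check follows; the class name, method name and flattened String parameters are assumptions for illustration, not code from the repository.

    import java.util.Locale;

    final class CitationRelFilterSketch {

        // true when the related identifier can be resolved (doi/pmid/arxiv) and the
        // DataCite relation type denotes a citation or a reference
        static boolean isCitationRel(final String relatedIdentifierType, final String relationType) {
            final boolean resolvableIdType = relatedIdentifierType.equalsIgnoreCase("doi")
                || relatedIdentifierType.equalsIgnoreCase("pmid")
                || relatedIdentifierType.equalsIgnoreCase("arxiv");
            final String type = relationType.toLowerCase(Locale.ROOT);
            return resolvableIdType && (type.contains("cite") || type.contains("reference"));
        }
    }
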
From 7d263f265e9fdde4fd8870a136aa352b47a506c6 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Wed, 1 Mar 2023 11:58:07 +0100
Subject: [PATCH 14/16] adjusted logs

---
 .../dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java
index 4949cc627..6f0adc75a 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java
@@ -131,7 +131,7 @@ public class MigrateMongoMdstoresApplication extends AbstractMigrationApplicatio
 			// that is the hdfs path basePath/MDSTOREID/timestamp is missing
 			// So we have to synch it
 			if (!hdfsMDStores.containsKey(currentMDStore.getMdstore())) {
-				log.info("Adding store " + currentMDStore.getMdstore());
+				log.info("Adding store {}", currentMDStore.getMdstore());
 				try {
 					synchMDStoreIntoHDFS(
 						mdFormat, mdLayout, mdInterpretation, hdfsPath, fileSystem, mongoBaseUrl, mongoDb,
@@ -145,14 +145,14 @@ public class MigrateMongoMdstoresApplication extends AbstractMigrationApplicatio
 				// basePath/MDSTOREID/timestamp but the timestamp on hdfs is older that the
 				// new one in mongo so we have to synch the new mdstore and delete the old one
 				if (currentMDStore.getLatestTimestamp() > current.getLatestTimestamp()) {
-					log.info("Updating MDStore " + currentMDStore.getMdstore());
+					log.info("Updating MDStore {}", currentMDStore.getMdstore());
 					final String mdstoreDir = createMDStoreDir(hdfsPath, currentMDStore.getMdstore());
 					final String rmPath = createMDStoreDir(mdstoreDir, current.getLatestTimestamp().toString());
 					try {
 						synchMDStoreIntoHDFS(
 							mdFormat, mdLayout, mdInterpretation, hdfsPath, fileSystem, mongoBaseUrl, mongoDb,
 							currentMDStore);
-						log.info("deleting " + rmPath);
+						log.info("deleting {}", rmPath);
 						// DELETE THE OLD MDSTORE
 						fileSystem.delete(new Path(rmPath), true);
 					} catch (IOException e) {
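
The change above swaps string concatenation for SLF4J's parameterized logging: the placeholder form defers message construction until the log level is known to be enabled. A small self-contained comparison, where the class name is illustrative but the SLF4J API calls are real:

    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;

    class LoggingStyleSketch {

        private static final Logger log = LoggerFactory.getLogger(LoggingStyleSketch.class);

        void report(final String mdstoreId) {
            // before: the message string is concatenated even when INFO logging is disabled
            log.info("Adding store " + mdstoreId);
            // after: the {} placeholder is substituted only if the message is actually emitted
            log.info("Adding store {}", mdstoreId);
        }
    }
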
From 6f488547a7b318740666b16a8f4184658ff3cd55 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Wed, 1 Mar 2023 14:49:51 +0100
Subject: [PATCH 15/16] ignore non processable records

---
 .../oa/graph/raw/MigrateHdfsMdstoresApplication.java | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateHdfsMdstoresApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateHdfsMdstoresApplication.java
index ab6f54b92..f1f59b398 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateHdfsMdstoresApplication.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateHdfsMdstoresApplication.java
@@ -6,11 +6,7 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import java.io.IOException;
 import java.io.StringReader;
 import java.text.SimpleDateFormat;
-import java.util.Arrays;
-import java.util.Date;
-import java.util.Optional;
-import java.util.Set;
-import java.util.UUID;
+import java.util.*;
 import java.util.stream.Collectors;
 
 import org.apache.commons.io.IOUtils;
@@ -24,6 +20,7 @@ import org.apache.http.impl.client.CloseableHttpClient;
 import org.apache.http.impl.client.HttpClients;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.FilterFunction;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.Row;
@@ -110,6 +107,7 @@ public class MigrateHdfsMdstoresApplication extends AbstractMigrationApplication
 			.read()
 			.parquet(validPaths)
 			.map((MapFunction<Row, String>) MigrateHdfsMdstoresApplication::enrichRecord, Encoders.STRING())
+			.filter((FilterFunction<String>) Objects::nonNull)
 			.toJavaRDD()
 			.mapToPair(xml -> new Tuple2<>(new Text(UUID.randomUUID() + ":" + type), new Text(xml)))
 //			.coalesce(1)
@@ -135,13 +133,14 @@ public class MigrateHdfsMdstoresApplication extends AbstractMigrationApplication
 			reader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
 			final Document doc = reader.read(new StringReader(xml));
 			final Element head = (Element) doc.selectSingleNode("//*[local-name() = 'header']");
 
+			head.addElement(new QName("objIdentifier", DRI_NS_PREFIX)).addText(r.getAs("id"));
 			head.addElement(new QName("dateOfCollection", DRI_NS_PREFIX)).addText(collDate);
 			head.addElement(new QName("dateOfTransformation", DRI_NS_PREFIX)).addText(tranDate);
 			return doc.asXML();
 		} catch (final Exception e) {
 			log.error("Error patching record: " + xml);
-			throw new RuntimeException("Error patching record: " + xml, e);
+			return null;
 		}
 	}
 
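
The pattern introduced in this patch, a mapper that returns null on failure followed by a non-null filter, trades a hard job failure for silently dropping records that cannot be patched. A standalone sketch of the same idea, with simplified types standing in for the project's classes:

    import java.util.Objects;

    import org.apache.spark.api.java.function.FilterFunction;
    import org.apache.spark.api.java.function.MapFunction;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Encoders;

    class SkipBrokenRecordsSketch {

        static Dataset<String> enrichAll(final Dataset<String> records) {
            return records
                // the mapper returns null instead of throwing when a record cannot be processed
                .map((MapFunction<String, String>) SkipBrokenRecordsSketch::tryEnrich, Encoders.STRING())
                // dropping the nulls keeps one bad record from failing the whole job
                .filter((FilterFunction<String>) Objects::nonNull);
        }

        private static String tryEnrich(final String xml) {
            try {
                return xml.trim(); // placeholder for the actual XML-patching logic
            } catch (final Exception e) {
                return null; // ignore non processable records
            }
        }
    }
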
From db9dad4aa783d99e937ab946f3c1d0479ceb83d8 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Thu, 2 Mar 2023 09:11:37 +0100
Subject: [PATCH 16/16] [actionmanager] increased spark.sql.shuffle.partitions
 for publication, dataset, relation records

---
 .../dhp/actionmanager/wf/dataset/oozie_app/workflow.xml      | 4 ++--
 .../dhp/actionmanager/wf/publication/oozie_app/workflow.xml  | 4 ++--
 .../dhp/actionmanager/wf/relation/oozie_app/workflow.xml     | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/dataset/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/dataset/oozie_app/workflow.xml
index 4dc250c29..4f374a75a 100644
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/dataset/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/dataset/oozie_app/workflow.xml
@@ -107,7 +107,7 @@
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=2560
+                --conf spark.sql.shuffle.partitions=7000
             </spark-opts>
             <arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/dataset</arg>
             <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
@@ -159,7 +159,7 @@
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=2560
+                --conf spark.sql.shuffle.partitions=7000
             </spark-opts>
             <arg>--inputGraphTablePath</arg><arg>${workingDir}/dataset</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml
index b1c8e7c85..b76dc82f1 100644
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml
@@ -107,7 +107,7 @@
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=5000
+                --conf spark.sql.shuffle.partitions=7000
             </spark-opts>
             <arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/publication</arg>
             <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
@@ -159,7 +159,7 @@
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=5000
+                --conf spark.sql.shuffle.partitions=7000
             </spark-opts>
             <arg>--inputGraphTablePath</arg><arg>${workingDir}/publication</arg>
             <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml
index 20ffe26d3..d3086dbdc 100644
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml
@@ -99,7 +99,7 @@
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=5000
+                --conf spark.sql.shuffle.partitions=10000
             </spark-opts>
             <arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/relation</arg>
             <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
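
For context on the tuning above: spark.sql.shuffle.partitions sets how many partitions Spark SQL produces for wide operations such as joins and aggregations. Raising it to 7000 and 10000 for the largest record types (publication, dataset, relation) shrinks each shuffle partition at the cost of scheduling more tasks. A runnable toy example of setting the same knob programmatically instead of via Oozie's spark-opts; the app name and local master are placeholders:

    import org.apache.spark.sql.SparkSession;

    public class ShufflePartitionsSketch {

        public static void main(final String[] args) {
            final SparkSession spark = SparkSession
                .builder()
                .appName("shuffle-partitions-sketch")
                .master("local[*]")
                // same value the dataset/publication workflows now pass via --conf
                .config("spark.sql.shuffle.partitions", 7000)
                .getOrCreate();

            // every wide transformation (join, groupBy, ...) now shuffles into 7000 partitions
            System.out.println(spark.conf().get("spark.sql.shuffle.partitions"));
            spark.stop();
        }
    }
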