From ed052a3476bf5c8980412b0d1b8387491d761ab2 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Mon, 6 May 2024 16:08:33 +0200 Subject: [PATCH 01/24] job for the population of the oai database --- .../dhp/oa/oaipmh/IrishOaiExporterJob.java | 156 ++++++++++++++++++ .../dhp/oa/oaipmh/OaiRecordWrapper.java | 50 ++++++ 2 files changed, 206 insertions(+) create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/OaiRecordWrapper.java diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java new file mode 100644 index 000000000..9a608b6fa --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java @@ -0,0 +1,156 @@ +package eu.dnetlib.dhp.oa.oaipmh; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.time.LocalDateTime; +import java.util.ArrayList; +import java.util.Optional; +import java.util.Properties; +import java.util.zip.GZIPOutputStream; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.dom4j.Document; +import org.dom4j.DocumentHelper; +import org.dom4j.Node; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.oa.provision.XmlConverterJob; +import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument; +import eu.dnetlib.dhp.oa.provision.model.TupleWrapper; + +public class IrishOaiExporterJob { + + private static final Logger log = LoggerFactory.getLogger(IrishOaiExporterJob.class); + + protected static final int NUM_CONNECTIONS = 20; + + public static void main(final String[] args) throws Exception { + + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString(XmlConverterJob.class + .getResourceAsStream("/eu/dnetlib/dhp/oa/oaipmh/input_params_irish_oai_exporter.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String inputPath = parser.get("inputPath"); + final String dbUrl = parser.get("dbUrl"); + final String dbTable = parser.get("dbTable"); + final String dbUser = parser.get("dbUser"); + final String dbPwd = parser.get("dbPwd"); + final int numConnections = Optional + .ofNullable(parser.get("numConnections")) + .map(Integer::valueOf) + .orElse(NUM_CONNECTIONS); + + log.info("inputPath: '{}'", inputPath); + log.info("dbUrl: '{}'", dbUrl); + log.info("dbUser: '{}'", dbUser); + log.info("table: '{}'", dbTable); + log.info("dbPwd: '{}'", "xxx"); + log.info("numPartitions: '{}'", numConnections); + + final Properties connectionProperties = new Properties(); + connectionProperties.put("user", dbUser); + 
connectionProperties.put("password", dbPwd); + + final SparkConf conf = new SparkConf(); + conf.registerKryoClasses(new Class[] { + SerializableSolrInputDocument.class + }); + + final Encoder encoderTuple = Encoders.bean(TupleWrapper.class); + final Encoder encoderOaiRecord = Encoders.bean(OaiRecordWrapper.class); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + final Dataset docs = spark + .read() + .schema(encoderTuple.schema()) + .json(inputPath) + .as(encoderTuple) + .map((MapFunction) TupleWrapper::getXml, Encoders.STRING()) + .map((MapFunction) IrishOaiExporterJob::asIrishOaiResult, encoderOaiRecord) + .filter((FilterFunction) obj -> (obj != null) && StringUtils.isNotBlank(obj.getId())); + + docs.repartition(numConnections) + .write() + .mode(SaveMode.Overwrite) + .jdbc(dbUrl, dbTable, connectionProperties); + + }); + } + + private static OaiRecordWrapper asIrishOaiResult(final String xml) { + try { + final Document doc = DocumentHelper.parseText(xml); + final OaiRecordWrapper r = new OaiRecordWrapper(); + + if (isValid(doc)) { + r.setId(doc.valueOf("//*[local-name()='objIdentifier']").trim()); + r.setBody(gzip(xml)); + r.setDate(LocalDateTime.now()); + r.setSets(new ArrayList<>()); + } + return r; + } catch (final Exception e) { + log.error("Error parsing record: " + xml, e); + throw new RuntimeException("Error parsing record: " + xml, e); + } + } + + private static boolean isValid(final Document doc) { + + final Node n = doc.selectSingleNode("//*[local-name()='entity']/*[local-name()='result']"); + + if (n != null) { + for (final Object o : n.selectNodes(".//*[local-name()='datainfo']/*[local-name()='deletedbyinference']")) { + if ("true".equals(((Node) o).getText().trim())) { return false; } + } + + for (final Object o : n.selectNodes("./*[local-name()='country']")) { + if ("IE".equals(((Node) o).valueOf("@classid").trim())) { return true; } + } + + for (final Object o : n.selectNodes(".//*[local-name()='rel']")) { + final String relType = ((Node) o).valueOf("./[local-name() = 'to']/@type").trim(); + final String relCountry = ((Node) o).valueOf("./*[local-name() = 'country']/@classid").trim(); + if ("organization".equals(relType) && "IE".equals(relCountry)) { return true; } + } + } + return false; + + } + + private static byte[] gzip(final String str) { + if (StringUtils.isBlank(str)) { return null; } + + try { + final ByteArrayOutputStream obj = new ByteArrayOutputStream(); + final GZIPOutputStream gzip = new GZIPOutputStream(obj); + gzip.write(str.getBytes("UTF-8")); + gzip.flush(); + gzip.close(); + return obj.toByteArray(); + } catch (final IOException e) { + throw new RuntimeException("error in gzip", e); + } + } +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/OaiRecordWrapper.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/OaiRecordWrapper.java new file mode 100644 index 000000000..4c2766754 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/OaiRecordWrapper.java @@ -0,0 +1,50 @@ +package eu.dnetlib.dhp.oa.oaipmh; + +import java.io.Serializable; +import java.time.LocalDateTime; +import java.util.List; + +public class OaiRecordWrapper implements Serializable { + + private static final long serialVersionUID = 8997046455575004880L; + + private String id; + private byte[] body; + private LocalDateTime date; + private List sets; + + public OaiRecordWrapper() {} + + public String getId() { + return this.id; + } + + public void setId(final String id) 
{ + this.id = id; + } + + public byte[] getBody() { + return this.body; + } + + public void setBody(final byte[] body) { + this.body = body; + } + + public LocalDateTime getDate() { + return this.date; + } + + public void setDate(final LocalDateTime date) { + this.date = date; + } + + public List getSets() { + return this.sets; + } + + public void setSets(final List sets) { + this.sets = sets; + } + +} From aa40e53c19acf6c8007b7819bea3e65ba642e057 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Tue, 7 May 2024 08:01:19 +0200 Subject: [PATCH 02/24] oai exporter parameters --- .../input_params_irish_oai_exporter.json | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/oaipmh/input_params_irish_oai_exporter.json diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/oaipmh/input_params_irish_oai_exporter.json b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/oaipmh/input_params_irish_oai_exporter.json new file mode 100644 index 000000000..99a12927b --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/oaipmh/input_params_irish_oai_exporter.json @@ -0,0 +1,38 @@ +[ + { + "paramName": "i", + "paramLongName": "inputPath", + "paramDescription": "The path of the input records on HDFS", + "paramRequired": true + }, + { + "paramName": "nc", + "paramLongName": "numConnections", + "paramDescription": "number of connections to the postgres db (for the write operation)", + "paramRequired": false + }, + { + "paramName": "du", + "paramLongName": "dbUrl", + "paramDescription": "the url of the database", + "paramRequired": true + }, + { + "paramName": "dusr", + "paramLongName": "dbUser", + "paramDescription": "the user of the database", + "paramRequired": true + }, + { + "paramName": "t", + "paramLongName": "dbTable", + "paramDescription": "the name of the table in the database", + "paramRequired": true + }, + { + "paramName": "dpwd", + "paramLongName": "dbPwd", + "paramDescription": "the password for the user of the database", + "paramRequired": true + } +] From 70bf6ac41561d487109a04ef60b0659a8785d989 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Tue, 7 May 2024 09:36:26 +0200 Subject: [PATCH 03/24] oai exporter tests --- .../dhp/oa/oaipmh/IrishOaiExporterJob.java | 11 ++- .../oa/oaipmh/IrishOaiExporterJobTest.java | 93 +++++++++++++++++++ .../eu/dnetlib/dhp/oa/oaipmh/record_IE.xml | 89 ++++++++++++++++++ .../dhp/oa/oaipmh/record_IE_deleted.xml | 89 ++++++++++++++++++ .../eu/dnetlib/dhp/oa/oaipmh/record_IT.xml | 66 +++++++++++++ 5 files changed, 344 insertions(+), 4 deletions(-) create mode 100644 dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJobTest.java create mode 100644 dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/oaipmh/record_IE.xml create mode 100644 dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/oaipmh/record_IE_deleted.xml create mode 100644 dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/oaipmh/record_IT.xml diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java index 9a608b6fa..e2ae890e5 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java +++ 
b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java @@ -98,7 +98,7 @@ public class IrishOaiExporterJob { }); } - private static OaiRecordWrapper asIrishOaiResult(final String xml) { + protected static OaiRecordWrapper asIrishOaiResult(final String xml) { try { final Document doc = DocumentHelper.parseText(xml); final OaiRecordWrapper r = new OaiRecordWrapper(); @@ -116,21 +116,24 @@ public class IrishOaiExporterJob { } } - private static boolean isValid(final Document doc) { + protected static boolean isValid(final Document doc) { final Node n = doc.selectSingleNode("//*[local-name()='entity']/*[local-name()='result']"); if (n != null) { + for (final Object o : n.selectNodes(".//*[local-name()='datainfo']/*[local-name()='deletedbyinference']")) { if ("true".equals(((Node) o).getText().trim())) { return false; } } + // verify the main country of the result for (final Object o : n.selectNodes("./*[local-name()='country']")) { if ("IE".equals(((Node) o).valueOf("@classid").trim())) { return true; } } + // verify the countries of the related organizations for (final Object o : n.selectNodes(".//*[local-name()='rel']")) { - final String relType = ((Node) o).valueOf("./[local-name() = 'to']/@type").trim(); + final String relType = ((Node) o).valueOf("./*[local-name() = 'to']/@type").trim(); final String relCountry = ((Node) o).valueOf("./*[local-name() = 'country']/@classid").trim(); if ("organization".equals(relType) && "IE".equals(relCountry)) { return true; } } @@ -139,7 +142,7 @@ public class IrishOaiExporterJob { } - private static byte[] gzip(final String str) { + protected static byte[] gzip(final String str) { if (StringUtils.isBlank(str)) { return null; } try { diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJobTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJobTest.java new file mode 100644 index 000000000..6140b0907 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJobTest.java @@ -0,0 +1,93 @@ +package eu.dnetlib.dhp.oa.oaipmh; + +import static org.junit.Assert.assertNull; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.BufferedReader; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.zip.GZIPInputStream; + +import org.apache.commons.io.IOUtils; +import org.dom4j.Document; +import org.dom4j.DocumentException; +import org.dom4j.io.SAXReader; +import org.junit.jupiter.api.Test; + +class IrishOaiExporterJobTest { + + @Test + void testAsIrishOaiResult() throws Exception { + final String xml = IOUtils.toString(getClass().getResourceAsStream("record_IE.xml")); + final OaiRecordWrapper res = IrishOaiExporterJob.asIrishOaiResult(xml); + assertNotNull(res.getId()); + assertNotNull(res.getBody()); + assertNotNull(res.getSets()); + assertNotNull(res.getDate()); + assertEquals("dedup_wf_002::532be02f990b479a1da46d71f1a4c3f0", res.getId()); + assertTrue(res.getBody().length > 0); + assertTrue(res.getSets().isEmpty()); + } + + @Test + void testIsValid_IE() throws DocumentException { + final Document doc = new SAXReader().read(getClass().getResourceAsStream("record_IE.xml")); + 
assertTrue(IrishOaiExporterJob.isValid(doc)); + } + + @Test + void testIsValid_invalid_country() throws DocumentException { + final Document doc = new SAXReader().read(getClass().getResourceAsStream("record_IT.xml")); + assertFalse(IrishOaiExporterJob.isValid(doc)); + } + + @Test + void testIsValid_deleted() throws DocumentException { + final Document doc = new SAXReader().read(getClass().getResourceAsStream("record_IE_deleted.xml")); + assertFalse(IrishOaiExporterJob.isValid(doc)); + } + + @Test + void testGzip_simple() { + final String message = ""; + final byte[] bytes = IrishOaiExporterJob.gzip(message); + assertNotNull(bytes); + assertTrue(bytes.length > 0); + assertEquals(message, decompress(bytes)); + } + + @Test + void testGzip_empty() { + assertNull(IrishOaiExporterJob.gzip("")); + assertNull(IrishOaiExporterJob.gzip(null)); + } + + private static String decompress(final byte[] compressed) { + final StringBuilder outStr = new StringBuilder(); + if ((compressed == null) || (compressed.length == 0)) { return null; } + try { + if (isCompressed(compressed)) { + final GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(compressed)); + final BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(gis, "UTF-8")); + + String line; + while ((line = bufferedReader.readLine()) != null) { + outStr.append(line); + } + } else { + outStr.append(compressed); + } + return outStr.toString(); + } catch (final IOException e) { + throw new RuntimeException("error in gunzip", e); + } + } + + private static boolean isCompressed(final byte[] compressed) { + return (compressed[0] == (byte) GZIPInputStream.GZIP_MAGIC) && (compressed[1] == (byte) (GZIPInputStream.GZIP_MAGIC >> 8)); + } +} diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/oaipmh/record_IE.xml b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/oaipmh/record_IE.xml new file mode 100644 index 000000000..01b7334f8 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/oaipmh/record_IE.xml @@ -0,0 +1,89 @@ + + +
+ dedup_wf_002::532be02f990b479a1da46d71f1a4c3f0 + 2023-03-31T18:37:45.599Z + 2023-03-31T18:45:52.701Z +
+ + + + + + 50|od______6005::55a12e2e0fee45ce8005633c6c17fe9f + oai:repository.wit.ie:3029 + 50|od_______934::e7162a5632264cd622ee7180ca66fdce + oai:generic.eprints.org:3029 + 50|od_______934::55a12e2e0fee45ce8005633c6c17fe9f + + + + + + http://repository.wit.ie/3029/1/Research%20Day%202015%20-%20Poster%20Tadhg%20Blommerde.pdf + A service innovation capability maturity model for SMEs + + Blommerde, Tadhg + Lynch, Patrick + + 2015-04-28 + There is general consensus that service innovations are prerequisite to sustained competitive advantage and are an essential mechanism for responding to changes in customer needs and the operating environment of firms (Giannopoulou et al., 2011; Stryja et al., 2013). Services have been described as ubiquitous in their role of generating economic growth and wellbeing and represent over 70% of employment and GDP in developed nations (Janssen et al., 2012; Mustak, 2014). As a consequence, service innovations must be a core ambition of all countries, regions, and firms wishing to remain competitive (van Ark et al., 2003). While acknowledging the importance of once-off innovations, more critical still is the capability to repeatedly introduce and exploit service innovations (Siguaw et al., 2006). This is generally referred to as service innovation capability (SIC) and describes the repeatable routines and behaviours that organisations have in place to transform ideas and knowledge into innovations (Basterretxea and Martínez, 2012). However, despite links between SIC and continuous, sustainable, and consistent service innovations, there is evidence that many organisations struggle with its effective management (Adams et al., 2006; den Hertog et al., 2010). This is often attributed to the lack of formal guidance available and the absence of metrics to determine an organisation’s SIC performance (Hogan et al., 2011; Szczygielski, 2011). Maturity modelling research in this discipline remains at an embryonic stage, thus far presenting only conceptual and opaque discussions that fail to address the necessity for an assessment and strategic management framework (Gryszkiewicz et al., 2013; Hipp and Grupp, 2005). Therefore, the purpose of this ongoing research project is to evaluate the maturity of an organisation’s SIC to inform its effective management and enhancement. To achieve this it dimensionalises the concept into four constituent capabilities, specifically, strategising, customer involvement, knowledge management, and networking (Blommerde and Lynch, 2014). The study then tracks the maturity of these capabilities as they progress through eight evolutionary plateaus towards a fully developed or optimal state. This is accomplished through a capability maturity model that enables organisations to rapidly diagnose key areas of strength and weakness to systematically cultivate behaviours that leverage their untapped innovative potential (Wendler, 2012; Essmann and du Preez, 2010). As a result of the immense knowledge vacuum characteristic of this discipline, it is anticipated that this ongoing research project will make a substantial contribution to both academic understanding and take strides towards filling the void in practical support (Rapaccini et al., 2013). 
It expands the service innovation literature by detailing key service innovation levers, bolsters the discipline through clear definitions of terminology, provides a powerful explanation of the development of SICs, and operationalises the dynamic capabilities view through a novel self-assessment reference model (Jochem et al., 2011). The next step in the project is the evaluation of the, as yet, conceptual service innovation capability maturity model. Adopting a positivistic philosophical stance, the study proposes the use of structural equation modelling on data gathered through an extensive survey to confirm the model and support theoretical assumptions. + RIKON (Research in Inovation, Knowledge & Organisational Networks) + + application/pdf + + + false + false + true + + + + true + false + 0.8 + dedup-result-decisiontree-v4 + + + + + openorgs____::54cd984fc7d3b153ec2181f985041f02 + + WIT + South East Technological University + + + + + A service innovation capability maturity model for SMEs + 2015-04-28 + + + + A service innovation capability maturity model for SMEs + 2015-04-28 + + + + A service innovation capability maturity model for SMEs + 2015-04-28 + + + + + + + 2015-04-28 + + + http://repository.wit.ie/3029/1/Research%20Day%202015%20-%20Poster%20Tadhg%20Blommerde.pdf + + http://repository.wit.ie/3029/ + + + + + + +
+
\ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/oaipmh/record_IE_deleted.xml b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/oaipmh/record_IE_deleted.xml new file mode 100644 index 000000000..00d225aa5 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/oaipmh/record_IE_deleted.xml @@ -0,0 +1,89 @@ + + +
+ dedup_wf_002::532be02f990b479a1da46d71f1a4c3f0 + 2023-03-31T18:37:45.599Z + 2023-03-31T18:45:52.701Z +
+ + + + + + 50|od______6005::55a12e2e0fee45ce8005633c6c17fe9f + oai:repository.wit.ie:3029 + 50|od_______934::e7162a5632264cd622ee7180ca66fdce + oai:generic.eprints.org:3029 + 50|od_______934::55a12e2e0fee45ce8005633c6c17fe9f + + + + + + http://repository.wit.ie/3029/1/Research%20Day%202015%20-%20Poster%20Tadhg%20Blommerde.pdf + A service innovation capability maturity model for SMEs + + Blommerde, Tadhg + Lynch, Patrick + + 2015-04-28 + There is general consensus that service innovations are prerequisite to sustained competitive advantage and are an essential mechanism for responding to changes in customer needs and the operating environment of firms (Giannopoulou et al., 2011; Stryja et al., 2013). Services have been described as ubiquitous in their role of generating economic growth and wellbeing and represent over 70% of employment and GDP in developed nations (Janssen et al., 2012; Mustak, 2014). As a consequence, service innovations must be a core ambition of all countries, regions, and firms wishing to remain competitive (van Ark et al., 2003). While acknowledging the importance of once-off innovations, more critical still is the capability to repeatedly introduce and exploit service innovations (Siguaw et al., 2006). This is generally referred to as service innovation capability (SIC) and describes the repeatable routines and behaviours that organisations have in place to transform ideas and knowledge into innovations (Basterretxea and Martínez, 2012). However, despite links between SIC and continuous, sustainable, and consistent service innovations, there is evidence that many organisations struggle with its effective management (Adams et al., 2006; den Hertog et al., 2010). This is often attributed to the lack of formal guidance available and the absence of metrics to determine an organisation’s SIC performance (Hogan et al., 2011; Szczygielski, 2011). Maturity modelling research in this discipline remains at an embryonic stage, thus far presenting only conceptual and opaque discussions that fail to address the necessity for an assessment and strategic management framework (Gryszkiewicz et al., 2013; Hipp and Grupp, 2005). Therefore, the purpose of this ongoing research project is to evaluate the maturity of an organisation’s SIC to inform its effective management and enhancement. To achieve this it dimensionalises the concept into four constituent capabilities, specifically, strategising, customer involvement, knowledge management, and networking (Blommerde and Lynch, 2014). The study then tracks the maturity of these capabilities as they progress through eight evolutionary plateaus towards a fully developed or optimal state. This is accomplished through a capability maturity model that enables organisations to rapidly diagnose key areas of strength and weakness to systematically cultivate behaviours that leverage their untapped innovative potential (Wendler, 2012; Essmann and du Preez, 2010). As a result of the immense knowledge vacuum characteristic of this discipline, it is anticipated that this ongoing research project will make a substantial contribution to both academic understanding and take strides towards filling the void in practical support (Rapaccini et al., 2013). 
It expands the service innovation literature by detailing key service innovation levers, bolsters the discipline through clear definitions of terminology, provides a powerful explanation of the development of SICs, and operationalises the dynamic capabilities view through a novel self-assessment reference model (Jochem et al., 2011). The next step in the project is the evaluation of the, as yet, conceptual service innovation capability maturity model. Adopting a positivistic philosophical stance, the study proposes the use of structural equation modelling on data gathered through an extensive survey to confirm the model and support theoretical assumptions. + RIKON (Research in Inovation, Knowledge & Organisational Networks) + + application/pdf + + + false + false + true + + + + true + true + 0.8 + dedup-result-decisiontree-v4 + + + + + openorgs____::54cd984fc7d3b153ec2181f985041f02 + + WIT + South East Technological University + + + + + A service innovation capability maturity model for SMEs + 2015-04-28 + + + + A service innovation capability maturity model for SMEs + 2015-04-28 + + + + A service innovation capability maturity model for SMEs + 2015-04-28 + + + + + + + 2015-04-28 + + + http://repository.wit.ie/3029/1/Research%20Day%202015%20-%20Poster%20Tadhg%20Blommerde.pdf + + http://repository.wit.ie/3029/ + + + + + + +
+
\ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/oaipmh/record_IT.xml b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/oaipmh/record_IT.xml new file mode 100644 index 000000000..7649589d1 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/oaipmh/record_IT.xml @@ -0,0 +1,66 @@ + + +
+ od_______310::02365c51a0ed7cbb54b2bbc7c0426d1b + 2024-04-06T06:05:16+0000 + 2024-04-06T06:56:01.776Z +
+ + + + + 50|od_______310::02365c51a0ed7cbb54b2bbc7c0426d1b + oai:flore.unifi.it:2158/608965 + 2158/608965 + + + + + + Estorsione (art. 629) + + MACRI', FRANCESCO + + 2011-01-01 + + 2011-01-01 + 2011-01-01 + 2015-04-28 + UTET + + + + false + false + 0.9 + null + + + + + openorgs____::41406edad82942e9e0b29317b8a847e2 + University of Florence + + University of Florence + + + + + + + + 2011-01-01 + + 2158/608965 + http://hdl.handle.net/2158/608965 + + + https://hdl.handle.net/2158/608965 + + + + + + +
+
\ No newline at end of file From e234848af8b0a313a0c8b3988d2ceb4f425edc78 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Wed, 8 May 2024 10:00:53 +0200 Subject: [PATCH 04/24] oaf record: xpath for root --- .../main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java index e2ae890e5..fff5d015d 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java @@ -105,7 +105,7 @@ public class IrishOaiExporterJob { if (isValid(doc)) { r.setId(doc.valueOf("//*[local-name()='objIdentifier']").trim()); - r.setBody(gzip(xml)); + r.setBody(gzip(doc.selectSingleNode("//*[local-name()='entity']").asXML())); r.setDate(LocalDateTime.now()); r.setSets(new ArrayList<>()); } From c9a327bc5094f48c08f4d7d0b3274378d8d8c63f Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Wed, 8 May 2024 11:34:08 +0200 Subject: [PATCH 05/24] refactoring of gzip method --- .../dhp/oa/oaipmh/IrishOaiExporterJob.java | 14 +++++------ .../oa/oaipmh/IrishOaiExporterJobTest.java | 24 +++++-------------- 2 files changed, 13 insertions(+), 25 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java index fff5d015d..b59f0ae73 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java @@ -4,6 +4,7 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.nio.charset.Charset; import java.time.LocalDateTime; import java.util.ArrayList; import java.util.Optional; @@ -145,15 +146,14 @@ public class IrishOaiExporterJob { protected static byte[] gzip(final String str) { if (StringUtils.isBlank(str)) { return null; } - try { - final ByteArrayOutputStream obj = new ByteArrayOutputStream(); - final GZIPOutputStream gzip = new GZIPOutputStream(obj); - gzip.write(str.getBytes("UTF-8")); - gzip.flush(); - gzip.close(); - return obj.toByteArray(); + try (final ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + try (final GZIPOutputStream gzip = new GZIPOutputStream(baos)) { + IOUtils.write(str.getBytes(Charset.defaultCharset()), gzip); + } + return baos.toByteArray(); } catch (final IOException e) { throw new RuntimeException("error in gzip", e); } } + } diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJobTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJobTest.java index 6140b0907..e33c701c5 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJobTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJobTest.java @@ -6,10 +6,9 @@ import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; import static 
org.junit.jupiter.api.Assertions.assertTrue; -import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.IOException; -import java.io.InputStreamReader; +import java.nio.charset.Charset; import java.util.zip.GZIPInputStream; import org.apache.commons.io.IOUtils; @@ -57,7 +56,7 @@ class IrishOaiExporterJobTest { final byte[] bytes = IrishOaiExporterJob.gzip(message); assertNotNull(bytes); assertTrue(bytes.length > 0); - assertEquals(message, decompress(bytes)); + assertEquals(message, gunzip(bytes)); } @Test @@ -66,22 +65,11 @@ class IrishOaiExporterJobTest { assertNull(IrishOaiExporterJob.gzip(null)); } - private static String decompress(final byte[] compressed) { - final StringBuilder outStr = new StringBuilder(); + public static String gunzip(final byte[] compressed) { if ((compressed == null) || (compressed.length == 0)) { return null; } - try { - if (isCompressed(compressed)) { - final GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(compressed)); - final BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(gis, "UTF-8")); - - String line; - while ((line = bufferedReader.readLine()) != null) { - outStr.append(line); - } - } else { - outStr.append(compressed); - } - return outStr.toString(); + if (!isCompressed(compressed)) { return new String(compressed); } + try (final GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(compressed))) { + return IOUtils.toString(gis, Charset.defaultCharset()); } catch (final IOException e) { throw new RuntimeException("error in gunzip", e); } From 2b3b5fe9a172bb1fafb4815a5c52aa9fcaff6644 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Wed, 15 May 2024 14:13:16 +0200 Subject: [PATCH 06/24] oai finalization and test --- .../dhp/oa/oaipmh/IrishOaiExporterJob.java | 26 ++++- .../dhp/oa/oaipmh/OaiRecordWrapper.java | 11 ++- .../input_params_irish_oai_exporter.json | 6 -- .../eu/dnetlib/dhp/oa/oaipmh/oai-finalize.sql | 12 +++ .../dhp/oa/oaipmh/DbSerializationTest.java | 97 +++++++++++++++++++ .../oa/oaipmh/IrishOaiExporterJobTest.java | 14 ++- 6 files changed, 146 insertions(+), 20 deletions(-) create mode 100644 dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/oaipmh/oai-finalize.sql create mode 100644 dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/oaipmh/DbSerializationTest.java diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java index b59f0ae73..433baf272 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.oa.oaipmh; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; @@ -5,6 +6,9 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.nio.charset.Charset; +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.Statement; import java.time.LocalDateTime; import java.util.ArrayList; import java.util.Optional; @@ -37,6 +41,8 @@ public class IrishOaiExporterJob { protected static final int NUM_CONNECTIONS = 20; + public static final String TMP_OAI_TABLE = "temp_oai_data"; + public static void main(final String[] args) throws Exception 
{ final ArgumentApplicationParser parser = new ArgumentApplicationParser( @@ -53,7 +59,6 @@ public class IrishOaiExporterJob { final String inputPath = parser.get("inputPath"); final String dbUrl = parser.get("dbUrl"); - final String dbTable = parser.get("dbTable"); final String dbUser = parser.get("dbUser"); final String dbPwd = parser.get("dbPwd"); final int numConnections = Optional @@ -64,7 +69,6 @@ public class IrishOaiExporterJob { log.info("inputPath: '{}'", inputPath); log.info("dbUrl: '{}'", dbUrl); log.info("dbUser: '{}'", dbUser); - log.info("table: '{}'", dbTable); log.info("dbPwd: '{}'", "xxx"); log.info("numPartitions: '{}'", numConnections); @@ -80,6 +84,7 @@ public class IrishOaiExporterJob { final Encoder encoderTuple = Encoders.bean(TupleWrapper.class); final Encoder encoderOaiRecord = Encoders.bean(OaiRecordWrapper.class); + log.info("Creating temporary table..."); runWithSparkSession(conf, isSparkSessionManaged, spark -> { final Dataset docs = spark @@ -91,12 +96,23 @@ public class IrishOaiExporterJob { .map((MapFunction) IrishOaiExporterJob::asIrishOaiResult, encoderOaiRecord) .filter((FilterFunction) obj -> (obj != null) && StringUtils.isNotBlank(obj.getId())); - docs.repartition(numConnections) + docs + .repartition(numConnections) .write() .mode(SaveMode.Overwrite) - .jdbc(dbUrl, dbTable, connectionProperties); + .jdbc(dbUrl, TMP_OAI_TABLE, connectionProperties); }); + log.info("Temporary table created."); + + log.info("Updating OAI records..."); + try (final Connection con = DriverManager.getConnection(dbUrl, dbUser, dbPwd)) { + try (final Statement st = con.createStatement()) { + final String query = IOUtils.toString(IrishOaiExporterJob.class.getResourceAsStream("oai-finalize.sql")); + st.execute(query); + } + } + log.info("DONE."); } protected static OaiRecordWrapper asIrishOaiResult(final String xml) { @@ -107,7 +123,7 @@ public class IrishOaiExporterJob { if (isValid(doc)) { r.setId(doc.valueOf("//*[local-name()='objIdentifier']").trim()); r.setBody(gzip(doc.selectSingleNode("//*[local-name()='entity']").asXML())); - r.setDate(LocalDateTime.now()); + r.setDate(LocalDateTime.now().toString()); r.setSets(new ArrayList<>()); } return r; diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/OaiRecordWrapper.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/OaiRecordWrapper.java index 4c2766754..2fdf32c96 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/OaiRecordWrapper.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/OaiRecordWrapper.java @@ -1,7 +1,7 @@ + package eu.dnetlib.dhp.oa.oaipmh; import java.io.Serializable; -import java.time.LocalDateTime; import java.util.List; public class OaiRecordWrapper implements Serializable { @@ -10,10 +10,11 @@ public class OaiRecordWrapper implements Serializable { private String id; private byte[] body; - private LocalDateTime date; + private String date; private List sets; - public OaiRecordWrapper() {} + public OaiRecordWrapper() { + } public String getId() { return this.id; @@ -31,11 +32,11 @@ public class OaiRecordWrapper implements Serializable { this.body = body; } - public LocalDateTime getDate() { + public String getDate() { return this.date; } - public void setDate(final LocalDateTime date) { + public void setDate(final String date) { this.date = date; } diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/oaipmh/input_params_irish_oai_exporter.json 
b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/oaipmh/input_params_irish_oai_exporter.json index 99a12927b..86b2bb0d3 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/oaipmh/input_params_irish_oai_exporter.json +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/oaipmh/input_params_irish_oai_exporter.json @@ -23,12 +23,6 @@ "paramDescription": "the user of the database", "paramRequired": true }, - { - "paramName": "t", - "paramLongName": "dbTable", - "paramDescription": "the name of the table in the database", - "paramRequired": true - }, { "paramName": "dpwd", "paramLongName": "dbPwd", diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/oaipmh/oai-finalize.sql b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/oaipmh/oai-finalize.sql new file mode 100644 index 000000000..1ec0dfee0 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/oaipmh/oai-finalize.sql @@ -0,0 +1,12 @@ +BEGIN; + +DELETE FROM oai_data; + +INSERT INTO oai_data(id, body, date, sets) SELECT + id, + body, + date::timestamp, + sets +FROM temp_oai_data; + +COMMIT; diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/oaipmh/DbSerializationTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/oaipmh/DbSerializationTest.java new file mode 100644 index 000000000..f33708f86 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/oaipmh/DbSerializationTest.java @@ -0,0 +1,97 @@ + +package eu.dnetlib.dhp.oa.oaipmh; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.IOException; +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.Statement; +import java.time.LocalDateTime; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Properties; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; + +@Disabled +public class DbSerializationTest { + + private static SparkSession spark; + + public static final String dbUrl = "jdbc:postgresql://localhost:5432/db_test"; + public static final String dbUser = null; + public static final String dbPwd = null; + + @BeforeAll + public static void beforeAll() throws IOException { + + final SparkConf conf = new SparkConf(); + conf.setAppName("TEST"); + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + + spark = SparkSession + .builder() + .appName("TEST") + .config(conf) + .getOrCreate(); + } + + @AfterAll + public static void afterAll() throws IOException { + spark.stop(); + } + + @Test + public void testDatabaseSerialization() throws Exception { + final Properties connectionProperties = new Properties(); + if (dbUser != null) { + connectionProperties.put("user", dbUser); + } + if (dbPwd != null) { + connectionProperties.put("password", dbPwd); + } + + runWithSparkSession(new SparkConf(), false, spark -> { + + final List list = new ArrayList<>(); + + for (int i = 0; i < 10; i++) { + final OaiRecordWrapper r = new OaiRecordWrapper(); + r.setId("record_" + i); + 
r.setBody("jsahdjkahdjahdajad".getBytes()); + r.setDate(LocalDateTime.now().toString()); + r.setSets(Arrays.asList()); + list.add(r); + } + + final Dataset docs = spark.createDataset(list, Encoders.bean(OaiRecordWrapper.class)); + + docs + .write() + .mode(SaveMode.Overwrite) + .jdbc(dbUrl, IrishOaiExporterJob.TMP_OAI_TABLE, connectionProperties); + + }); + + try (final Connection con = DriverManager.getConnection(dbUrl, dbUser, dbPwd)) { + try (final Statement st = con.createStatement()) { + final String query = IOUtils.toString(getClass().getResourceAsStream("oai-finalize.sql")); + st.execute(query); + } + } + + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJobTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJobTest.java index e33c701c5..57a32e246 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJobTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJobTest.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.oa.oaipmh; import static org.junit.Assert.assertNull; @@ -17,7 +18,7 @@ import org.dom4j.DocumentException; import org.dom4j.io.SAXReader; import org.junit.jupiter.api.Test; -class IrishOaiExporterJobTest { +public class IrishOaiExporterJobTest { @Test void testAsIrishOaiResult() throws Exception { @@ -66,8 +67,12 @@ class IrishOaiExporterJobTest { } public static String gunzip(final byte[] compressed) { - if ((compressed == null) || (compressed.length == 0)) { return null; } - if (!isCompressed(compressed)) { return new String(compressed); } + if ((compressed == null) || (compressed.length == 0)) { + return null; + } + if (!isCompressed(compressed)) { + return new String(compressed); + } try (final GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(compressed))) { return IOUtils.toString(gis, Charset.defaultCharset()); } catch (final IOException e) { @@ -76,6 +81,7 @@ class IrishOaiExporterJobTest { } private static boolean isCompressed(final byte[] compressed) { - return (compressed[0] == (byte) GZIPInputStream.GZIP_MAGIC) && (compressed[1] == (byte) (GZIPInputStream.GZIP_MAGIC >> 8)); + return (compressed[0] == (byte) GZIPInputStream.GZIP_MAGIC) + && (compressed[1] == (byte) (GZIPInputStream.GZIP_MAGIC >> 8)); } } From ca9414b737a1841eacf5cc7cea6c48f065ab3afc Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Tue, 21 May 2024 09:11:13 +0200 Subject: [PATCH 07/24] Implement multiple node name splitter on GZipCollectorPlugin and all nodes that use XMLIterator. 
If the splitter name contains is a comma separated values it splits for all the values --- .../plugin/gzip/GzipCollectorPlugin.java | 16 +++++ .../collection/plugin/utils/XMLIterator.java | 47 ++++++++++--- .../plugin/file/FileGZipMultipleNodeTest.java | 63 ++++++++++++++++++ .../dhp/collection/plugin/file/dblp.gz | Bin 0 -> 1097 bytes 4 files changed, 117 insertions(+), 9 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/gzip/GzipCollectorPlugin.java create mode 100644 dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/file/FileGZipMultipleNodeTest.java create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/plugin/file/dblp.gz diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/gzip/GzipCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/gzip/GzipCollectorPlugin.java new file mode 100644 index 000000000..44b1eeb18 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/gzip/GzipCollectorPlugin.java @@ -0,0 +1,16 @@ +package eu.dnetlib.dhp.collection.plugin.gzip; + +import eu.dnetlib.dhp.collection.ApiDescriptor; +import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; +import eu.dnetlib.dhp.common.aggregation.AggregatorReport; +import eu.dnetlib.dhp.common.collection.CollectorException; + +import java.util.stream.Stream; + +public class GzipCollectorPlugin implements CollectorPlugin { + + @Override + public Stream collect(ApiDescriptor api, AggregatorReport report) throws CollectorException { + return Stream.empty(); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XMLIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XMLIterator.java index e05fe263a..ca351346c 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XMLIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XMLIterator.java @@ -8,7 +8,10 @@ import java.io.StringWriter; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.nio.charset.CodingErrorAction; +import java.util.Arrays; import java.util.Iterator; +import java.util.List; +import java.util.stream.Collectors; import javax.xml.stream.XMLEventFactory; import javax.xml.stream.XMLEventReader; @@ -19,6 +22,7 @@ import javax.xml.stream.XMLStreamException; import javax.xml.stream.events.StartElement; import javax.xml.stream.events.XMLEvent; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -58,13 +62,22 @@ public class XMLIterator implements Iterator { private String element; + private List elements; + private InputStream inputStream; public XMLIterator(final String element, final InputStream inputStream) { super(); this.element = element; + if (element.contains(",")) { + elements= Arrays.stream(element.split(",")) + .filter(StringUtils::isNoneBlank) + .map(String::toLowerCase) + .collect(Collectors.toList()); + } this.inputStream = inputStream; this.parser = getParser(); + try { this.current = findElement(parser); } catch (XMLStreamException e) { @@ -113,7 +126,7 @@ public class XMLIterator implements Iterator { final XMLEvent event = parser.nextEvent(); // TODO: replace with depth tracking instead of close tag tracking. 
- if (event.isEndElement() && event.asEndElement().getName().getLocalPart().equals(element)) { + if (event.isEndElement() && isCheckTag(event.asEndElement().getName().getLocalPart())) { writer.add(event); break; } @@ -142,18 +155,16 @@ public class XMLIterator implements Iterator { XMLEvent peek = parser.peek(); if (peek != null && peek.isStartElement()) { String name = peek.asStartElement().getName().getLocalPart(); - if (element.equals(name)) { - return peek; - } + if( isCheckTag(name)) + return peek; } while (parser.hasNext()) { - final XMLEvent event = parser.nextEvent(); + XMLEvent event= parser.nextEvent(); if (event != null && event.isStartElement()) { String name = event.asStartElement().getName().getLocalPart(); - if (element.equals(name)) { - return event; - } + if( isCheckTag(name)) + return event; } } return null; @@ -161,12 +172,30 @@ public class XMLIterator implements Iterator { private XMLEventReader getParser() { try { - return inputFactory.get().createXMLEventReader(sanitize(inputStream)); + XMLInputFactory xif = inputFactory.get(); + xif.setProperty(XMLInputFactory.SUPPORT_DTD, false); + return xif.createXMLEventReader(sanitize(inputStream)); } catch (XMLStreamException e) { throw new RuntimeException(e); } } + private boolean isCheckTag(final String tagName) { + if (elements!= null) { + final String found =elements.stream() + .filter(e -> e.equalsIgnoreCase(tagName)) + .findFirst() + .orElse(null); + if (found!= null) + return true; + } else { + if (element.equalsIgnoreCase(tagName)) { + return true; + } + } + return false; + } + private Reader sanitize(final InputStream in) { final CharsetDecoder charsetDecoder = Charset.forName(UTF_8).newDecoder(); charsetDecoder.onMalformedInput(CodingErrorAction.REPLACE); diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/file/FileGZipMultipleNodeTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/file/FileGZipMultipleNodeTest.java new file mode 100644 index 000000000..2ed199156 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/file/FileGZipMultipleNodeTest.java @@ -0,0 +1,63 @@ +package eu.dnetlib.dhp.collection.plugin.file; + + +import eu.dnetlib.dhp.collection.ApiDescriptor; +import eu.dnetlib.dhp.common.aggregation.AggregatorReport; +import eu.dnetlib.dhp.common.collection.CollectorException; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.junit.jupiter.api.*; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.junit.jupiter.MockitoExtension; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Objects; +import java.util.stream.Stream; + +@TestMethodOrder(MethodOrderer.OrderAnnotation.class) +@ExtendWith(MockitoExtension.class) +public class FileGZipMultipleNodeTest { + + private static final Logger log = LoggerFactory.getLogger(FileGZipCollectorPluginTest.class); + + private final ApiDescriptor api = new ApiDescriptor(); + + private FileGZipCollectorPlugin plugin; + + private static final String SPLIT_ON_ELEMENT = "incollection,article"; + + @BeforeEach + public void setUp() throws IOException { + + final String gzipFile = Objects + .requireNonNull( + this + .getClass() + .getResource("/eu/dnetlib/dhp/collection/plugin/file/dblp.gz")) + .getFile(); + + api.setBaseUrl(gzipFile); + + HashMap params = new HashMap<>(); + params.put("splitOnElement", 
SPLIT_ON_ELEMENT); + + api.setParams(params); + + FileSystem fs = FileSystem.get(new Configuration()); + plugin = new FileGZipCollectorPlugin(fs); + } + + @Test + void test() throws CollectorException { + + final Stream stream = plugin.collect(api, new AggregatorReport()); + + stream.limit(10).forEach(s -> { + Assertions.assertTrue(s.length() > 0); + log.info(s); + }); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/plugin/file/dblp.gz b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/plugin/file/dblp.gz new file mode 100644 index 0000000000000000000000000000000000000000..979bcbed2845c7c7995db4f1fdc10069f89f1bcb GIT binary patch literal 1097 zcmV-P1h)GhiwFotJWOT)17u=sZ~)DhT~pgQ6hPnmE41#+&h8AhtbCBTYutfADHIAr zQrJH2$hKmu$dX5vv(5hcy|P0?0RyFF1|EVfTUW|CS652-_G=-~g6WDYx#bYgcMy|_ zO1aFooP+6!J01@MmpE^uSK)8FCzILd(>;{NqD0fr)7joJa_o(ln$&?e>~-`C!C%NB ziV!R@5EvCHHEhf2`DEn!BbW3Xbjg-mj%IVFVLXNx+|B_1O$Kr5pt6pIS(eZR9xfB1 zN|ti!w4s~uh8w}6Nv#(wm8`0~5VuY1S4x_(Dk8%m#O>w!(UMUe!E6R0ZZ6kPVx=xu zu~kvlhG}%(Ol<>gqqD2&qWzkuEX3=#*AW zWr~^xQ75*c)8m*7-7$H| zE{8VLo+NTb`-v(aUQ(Pteo0MhJ*PnBL~|w$ZN_@fE2A<^3-re>uM*7+L+5SHKx&Tm z_7DE-^fnrk7win0d%dS>YsfyEwBqs3w#yPIQVV99P$?muI(_svU$zc1{s|q$^Vqmh zGW(%oID@t0&MGdE?b=xh4+;?B?$C;*=!j>zxmTR!6!0{HE{|AKlFcPRX#YAvaV&R7MCk{sxlN*jdH*&%!!>>R`AluC4{DzMnaHaL~bR-LW#Oy(b-|pABK2!w<5m6`y|-E zj`cOb5RuBPhHOXgR4r4$4~y314B&}oW1_;6N@`Kxb^dz>=7+Ud-sb&{>Nx0sPqkae zgS-K@J)!k)@P1E)7!B@VdrFPw$tBwM(7A`~Yn_at7{FJsfZ5kl!{ON$!-EvIdp_q$ zyX8aa+Gmn!K)fw77Hi5AG+kDP75A~8u_TueLs-}z`t)Hsfy3aYztQ0SB)I{LAE3;o zs(`(Uuf{|ZxX|opl6~+)bnnUL&EOYiGt+!QV=lN^ZUB-L(wJTydnYg=9ITpSFr1>U zlDdM@qBLLv1xHjv_OI52ynOa7|CY(aAh;uwTlU*^R#&Fe#Aa}->?EmHMa1J`c82vA zi0zf>Jj8ADB4Pi%Y84Mrk>zF4wWdcE;ZecsW)1v(A=+svAi;D}i!(+ki z*|*G>w)|KZF_fpOQFx>>NZwQ39F7ZaxB4zW%?>P7hCZ@?E5VGh$$Z Date: Tue, 21 May 2024 13:45:29 +0200 Subject: [PATCH 08/24] removed plugin, use only FileGZip plugin --- .../plugin/gzip/GzipCollectorPlugin.java | 16 ---------------- 1 file changed, 16 deletions(-) delete mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/gzip/GzipCollectorPlugin.java diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/gzip/GzipCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/gzip/GzipCollectorPlugin.java deleted file mode 100644 index 44b1eeb18..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/gzip/GzipCollectorPlugin.java +++ /dev/null @@ -1,16 +0,0 @@ -package eu.dnetlib.dhp.collection.plugin.gzip; - -import eu.dnetlib.dhp.collection.ApiDescriptor; -import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; -import eu.dnetlib.dhp.common.aggregation.AggregatorReport; -import eu.dnetlib.dhp.common.collection.CollectorException; - -import java.util.stream.Stream; - -public class GzipCollectorPlugin implements CollectorPlugin { - - @Override - public Stream collect(ApiDescriptor api, AggregatorReport report) throws CollectorException { - return Stream.empty(); - } -} From e3f28338c147571f54c81fa9996b0c03f8f95455 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Tue, 28 May 2024 17:51:45 +0300 Subject: [PATCH 09/24] Miscellaneous updates to the copying operation to Impala Cluster: - Assign the WRITE and EXECUTE permissions to the DBs' HDFS-directories, in order to be able to create tables on top of them, in the Impala Cluster. 
- Make sure the "copydb" function returns early, when it encounters a fatal error, while respecting the "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR" config. --- .../oozie_app/copyDataToImpalaCluster.sh | 31 +++++++++++++------ .../oozie_app/copyDataToImpalaCluster.sh | 31 +++++++++++++------ .../oozie_app/copyDataToImpalaCluster.sh | 31 +++++++++++++------ .../oozie_app/copyDataToImpalaCluster.sh | 31 +++++++++++++------ 4 files changed, 88 insertions(+), 36 deletions(-) diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh index 26760d650..ca0f7a643 100644 --- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh @@ -72,6 +72,8 @@ function copydb() { rm -f error.log if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 2 + else + return 2 fi fi @@ -90,19 +92,30 @@ function copydb() { -pb \ ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} - # Check the exit status of the "hadoop distcp" command. - if [ $? -eq 0 ]; then - echo -e "\nSuccessfully copied the files of '${db}'.\n" + if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command. + echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n" else echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n" rm -f error.log if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 3 + else + return 3 fi fi - # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. - #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db + # Give WRITE and EXECUTE permissions to the DBs' directory only, in order to be able to create more tables later, on top of that DB. + hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod u+wx ${IMPALA_HDFS_DB_BASE_PATH}/${db}.db + # In case we ever use this script for a writable DB (using inserts/updates), we should perform the costly recursive operation as well, using the "-R" param. + if [ $? -ne 0 ]; then # Check the exit status.. + echo -e "\n\nERROR: FAILED TO ASSIGN WRITE AND EXECUTE PERMISSIONS TO THE DIRECTORY OF DB: '${db}'. GOT EXIT STATUS: $?\n\n" + rm -f error.log + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 4 + else + return 4 + fi + fi echo -e "\nCreating schema for db: '${db}'\n" @@ -131,7 +144,7 @@ function copydb() { if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! 
IT'S EMPTY!\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 4 + exit 5 fi else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log @@ -139,7 +152,7 @@ function copydb() { if [ -n "$log_errors" ]; then echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 5 + exit 6 fi fi fi @@ -185,7 +198,7 @@ function copydb() { if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 6 + exit 7 fi elif [[ $new_num_of_views_to_retry -gt 0 ]]; then echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" @@ -215,7 +228,7 @@ function copydb() { echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" rm -f error.log if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 7 + exit 8 fi fi diff --git a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh index 26760d650..ca0f7a643 100644 --- a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh @@ -72,6 +72,8 @@ function copydb() { rm -f error.log if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 2 + else + return 2 fi fi @@ -90,19 +92,30 @@ function copydb() { -pb \ ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} - # Check the exit status of the "hadoop distcp" command. - if [ $? -eq 0 ]; then - echo -e "\nSuccessfully copied the files of '${db}'.\n" + if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command. + echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n" else echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n" rm -f error.log if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 3 + else + return 3 fi fi - # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. - #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db + # Give WRITE and EXECUTE permissions to the DBs' directory only, in order to be able to create more tables later, on top of that DB. + hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod u+wx ${IMPALA_HDFS_DB_BASE_PATH}/${db}.db + # In case we ever use this script for a writable DB (using inserts/updates), we should perform the costly recursive operation as well, using the "-R" param. + if [ $? -ne 0 ]; then # Check the exit status.. + echo -e "\n\nERROR: FAILED TO ASSIGN WRITE AND EXECUTE PERMISSIONS TO THE DIRECTORY OF DB: '${db}'. 
GOT EXIT STATUS: $?\n\n" + rm -f error.log + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 4 + else + return 4 + fi + fi echo -e "\nCreating schema for db: '${db}'\n" @@ -131,7 +144,7 @@ function copydb() { if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 4 + exit 5 fi else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log @@ -139,7 +152,7 @@ function copydb() { if [ -n "$log_errors" ]; then echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 5 + exit 6 fi fi fi @@ -185,7 +198,7 @@ function copydb() { if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 6 + exit 7 fi elif [[ $new_num_of_views_to_retry -gt 0 ]]; then echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" @@ -215,7 +228,7 @@ function copydb() { echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" rm -f error.log if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 7 + exit 8 fi fi diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh index 1ab3e417a..dd2203eef 100644 --- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh @@ -72,6 +72,8 @@ function copydb() { rm -f error.log if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 2 + else + return 2 fi fi @@ -90,19 +92,30 @@ function copydb() { -pb \ ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} - # Check the exit status of the "hadoop distcp" command. - if [ $? -eq 0 ]; then - echo -e "\nSuccessfully copied the files of '${db}'.\n" + if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command. + echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n" else echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n" rm -f error.log if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 3 + else + return 3 fi fi - # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. - #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db + # Give WRITE and EXECUTE permissions to the DBs' directory only, in order to be able to create more tables later, on top of that DB. 
+ hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod u+wx ${IMPALA_HDFS_DB_BASE_PATH}/${db}.db + # In case we ever use this script for a writable DB (using inserts/updates), we should perform the costly recursive operation as well, using the "-R" param. + if [ $? -ne 0 ]; then # Check the exit status.. + echo -e "\n\nERROR: FAILED TO ASSIGN WRITE AND EXECUTE PERMISSIONS TO THE DIRECTORY OF DB: '${db}'. GOT EXIT STATUS: $?\n\n" + rm -f error.log + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 4 + else + return 4 + fi + fi echo -e "\nCreating schema for db: '${db}'\n" @@ -131,7 +144,7 @@ function copydb() { if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 4 + exit 5 fi else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log @@ -139,7 +152,7 @@ function copydb() { if [ -n "$log_errors" ]; then echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 5 + exit 6 fi fi fi @@ -185,7 +198,7 @@ function copydb() { if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 6 + exit 7 fi elif [[ $new_num_of_views_to_retry -gt 0 ]]; then echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" @@ -215,7 +228,7 @@ function copydb() { echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" rm -f error.log if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 7 + exit 8 fi fi diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh index 7957a659c..918775f49 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -74,6 +74,8 @@ function copydb() { rm -f error.log if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 2 + else + return 2 fi fi @@ -92,19 +94,30 @@ function copydb() { -pb \ ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} - # Check the exit status of the "hadoop distcp" command. - if [ $? -eq 0 ]; then - echo -e "\nSuccessfully copied the files of '${db}'.\n" + if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command. + echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n" else echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n" rm -f error.log if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 3 + else + return 3 fi fi - # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. 
- #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db + # Give WRITE and EXECUTE permissions to the DBs' directory only, in order to be able to create more tables later, on top of that DB. + hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod u+wx ${IMPALA_HDFS_DB_BASE_PATH}/${db}.db + # In case we ever use this script for a writable DB (using inserts/updates), we should perform the costly recursive operation as well, using the "-R" param. + if [ $? -ne 0 ]; then # Check the exit status.. + echo -e "\n\nERROR: FAILED TO ASSIGN WRITE AND EXECUTE PERMISSIONS TO THE DIRECTORY OF DB: '${db}'. GOT EXIT STATUS: $?\n\n" + rm -f error.log + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 4 + else + return 4 + fi + fi echo -e "\nCreating schema for db: '${db}'\n" @@ -133,7 +146,7 @@ function copydb() { if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 4 + exit 5 fi else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log @@ -141,7 +154,7 @@ function copydb() { if [ -n "$log_errors" ]; then echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 5 + exit 6 fi fi fi @@ -187,7 +200,7 @@ function copydb() { if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 6 + exit 7 fi elif [[ $new_num_of_views_to_retry -gt 0 ]]; then echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" @@ -217,7 +230,7 @@ function copydb() { echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" rm -f error.log if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 7 + exit 8 fi fi From e996787be232c0a4f214712d6fffc0884ab2c400 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Wed, 29 May 2024 15:05:17 +0200 Subject: [PATCH 10/24] OSF test --- .../collection/plugin/rest/OsfPreprintCollectorTest.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java index 90f4c7f25..0e64f8bab 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java @@ -39,8 +39,8 @@ public class OsfPreprintCollectorTest { private final String resumptionType = "page"; private final String resumptionXpath = "/*/*[local-name()='links']/*[local-name()='next']"; - private final String resultSizeParam = ""; - private final String resultSizeValue = ""; + private final String resultSizeParam = "page[size]"; + private final String resultSizeValue = "100"; private final String resultFormatParam = "format"; private final String resultFormatValue = "json"; @@ -74,7 +74,7 @@ public class 
OsfPreprintCollectorTest { final AtomicInteger i = new AtomicInteger(0); final Stream stream = this.rcp.collect(this.api, new AggregatorReport()); - stream.limit(200).forEach(s -> { + stream.limit(2000).forEach(s -> { Assertions.assertTrue(s.length() > 0); i.incrementAndGet(); log.info(s); From 1b165a14a09394adda40aca0eb3df238d471a448 Mon Sep 17 00:00:00 2001 From: Alessia Date: Wed, 29 May 2024 15:41:36 +0200 Subject: [PATCH 11/24] Rest collector plugin on hadoop supports a new param to pass request headers --- .../plugin/rest/RestCollectorPlugin.java | 11 ++-- .../collection/plugin/rest/RestIterator.java | 62 +++++++++++++------ .../plugin/rest/RestCollectorPluginTest.java | 42 ++++++++++--- .../plugin/rest/RestIteratorTest.java | 2 +- 4 files changed, 83 insertions(+), 34 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java index 997948687..8445e49e0 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java @@ -1,12 +1,14 @@ package eu.dnetlib.dhp.collection.plugin.rest; +import java.util.Map; import java.util.Optional; import java.util.Spliterator; import java.util.Spliterators; import java.util.stream.Stream; import java.util.stream.StreamSupport; +import com.google.gson.Gson; import org.apache.commons.lang3.StringUtils; import eu.dnetlib.dhp.collection.ApiDescriptor; @@ -47,6 +49,9 @@ public class RestCollectorPlugin implements CollectorPlugin { final String entityXpath = api.getParams().get("entityXpath"); final String authMethod = api.getParams().get("authMethod"); final String authToken = api.getParams().get("authToken"); + final String requestHeaderMap = api.getParams().get("requestHeaderMap"); + Gson gson = new Gson(); + Map requestHeaders = gson.fromJson(requestHeaderMap, Map.class); final String resultSizeValue = Optional .ofNullable(api.getParams().get("resultSizeValue")) .filter(StringUtils::isNotBlank) @@ -64,9 +69,6 @@ public class RestCollectorPlugin implements CollectorPlugin { if (StringUtils.isBlank(resultFormatValue)) { throw new CollectorException("Param 'resultFormatValue' is null or empty"); } - if (StringUtils.isBlank(queryParams)) { - throw new CollectorException("Param 'queryParams' is null or empty"); - } if (StringUtils.isBlank(entityXpath)) { throw new CollectorException("Param 'entityXpath' is null or empty"); } @@ -92,7 +94,8 @@ public class RestCollectorPlugin implements CollectorPlugin { entityXpath, authMethod, authToken, - resultOutputFormat); + resultOutputFormat, + requestHeaders); return StreamSupport .stream( diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java index 76af6cff1..e51c9eb1b 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java @@ -9,6 +9,7 @@ import java.net.URL; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; import java.util.Iterator; +import java.util.Map; import java.util.Queue; import java.util.concurrent.PriorityBlockingQueue; @@ 
-24,6 +25,7 @@ import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; +import com.google.common.collect.Maps; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.http.HttpHeaders; @@ -49,13 +51,14 @@ import eu.dnetlib.dhp.common.collection.HttpClientParams; */ public class RestIterator implements Iterator { + private static final Logger log = LoggerFactory.getLogger(RestIterator.class); public static final String UTF_8 = "UTF-8"; private static final int MAX_ATTEMPTS = 5; private final HttpClientParams clientParams; - private final String BASIC = "basic"; + private final String AUTHBASIC = "basic"; private final String baseUrl; private final String resumptionType; @@ -89,6 +92,12 @@ public class RestIterator implements Iterator { */ private final String resultOutputFormat; + /* + Can be used to set additional request headers, like for content negotiation + */ + private Map requestHeaders; + + /** * RestIterator class compatible to version 1.3.33 */ @@ -107,7 +116,8 @@ public class RestIterator implements Iterator { final String entityXpath, final String authMethod, final String authToken, - final String resultOutputFormat) { + final String resultOutputFormat, + final Map requestHeaders) { this.clientParams = clientParams; this.baseUrl = baseUrl; @@ -119,6 +129,7 @@ public class RestIterator implements Iterator { this.authMethod = authMethod; this.authToken = authToken; this.resultOutputFormat = resultOutputFormat; + this.requestHeaders = requestHeaders != null ? requestHeaders : Maps.newHashMap(); this.queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue : ""; @@ -231,25 +242,20 @@ public class RestIterator implements Iterator { final URL qUrl = new URL(query); log.debug("authMethod: {}", this.authMethod); - if ("bearer".equalsIgnoreCase(this.authMethod)) { - log.trace("authMethod before inputStream: {}", resultXml); - final HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); - conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + this.authToken); - conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType()); - conn.setRequestMethod("GET"); - theHttpInputStream = conn.getInputStream(); - } else if (this.BASIC.equalsIgnoreCase(this.authMethod)) { - log.trace("authMethod before inputStream: {}", resultXml); - final HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); - conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + this.authToken); - conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType()); - conn.setRequestMethod("GET"); - theHttpInputStream = conn.getInputStream(); - } else { - theHttpInputStream = qUrl.openStream(); + if (this.authMethod == "bearer") { + log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml); + requestHeaders.put("Authorization", "Bearer " + authToken); + //requestHeaders.put("Content-Type", "application/json"); + } else if (AUTHBASIC.equalsIgnoreCase(this.authMethod)) { + log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml); + requestHeaders.put("Authorization", "Basic " + authToken); + //requestHeaders.put("accept", "application/xml"); } + HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); + conn.setRequestMethod("GET"); + this.setRequestHeader(conn); + resultStream = conn.getInputStream(); - 
this.resultStream = theHttpInputStream; if ("json".equals(this.resultOutputFormat)) { resultJson = IOUtils.toString(this.resultStream, StandardCharsets.UTF_8); resultXml = JsonUtils.convertToXML(resultJson); @@ -380,7 +386,7 @@ public class RestIterator implements Iterator { try { if (this.resultTotal == -1) { this.resultTotal = Integer.parseInt(this.xprResultTotalPath.evaluate(resultNode)); - if ("page".equalsIgnoreCase(this.resumptionType) && !this.BASIC.equalsIgnoreCase(this.authMethod)) { + if ("page".equalsIgnoreCase(this.resumptionType) && !this.AUTHBASIC.equalsIgnoreCase(this.authMethod)) { this.resultTotal += 1; } // to correct the upper bound log.info("resultTotal was -1 is now: " + this.resultTotal); @@ -433,6 +439,22 @@ public class RestIterator implements Iterator { } } + /** + * setRequestHeader + * + * setRequestProperty: Sets the general request property. If a property with the key already exists, overwrite its value with the new value. + * @param conn + */ + private void setRequestHeader(HttpURLConnection conn) { + if (requestHeaders != null) { + for (String key : requestHeaders.keySet()) { + conn.setRequestProperty(key, requestHeaders.get(key)); + } + log.debug("Set Request Header with: " + requestHeaders); + } + + } + public String getResultFormatValue() { return this.resultFormatValue; } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java index f708c367b..a9fc325c3 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java @@ -4,10 +4,16 @@ package eu.dnetlib.dhp.collection.plugin.rest; +import java.io.IOException; +import java.io.InputStream; +import java.net.HttpURLConnection; +import java.net.MalformedURLException; +import java.net.URL; import java.util.HashMap; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Stream; +import com.google.gson.Gson; import org.junit.jupiter.api.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -25,18 +31,18 @@ class RestCollectorPluginTest { private static final Logger log = LoggerFactory.getLogger(RestCollectorPluginTest.class); - private final String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search"; - private final String resumptionType = "count"; - private final String resumptionParam = "from"; - private final String entityXpath = "//hits/hits"; - private final String resumptionXpath = "//hits"; - private final String resultTotalXpath = "//hits/total"; - private final String resultFormatParam = "format"; + private final String baseUrl = "https://ddh-openapi.worldbank.org/search"; + private final String resumptionType = "discover"; + private final String resumptionParam = "skip"; + private final String entityXpath = "//*[local-name()='data']"; + private final String resumptionXpath = ""; + private final String resultTotalXpath = "//*[local-name()='count']"; + private final String resultFormatParam = ""; private final String resultFormatValue = "json"; - private final String resultSizeParam = "size"; + private final String resultSizeParam = "top"; private final String resultSizeValue = "10"; // private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29"; - private final String query = 
"q=%28sources%3AengrXiv+AND+type%3Apreprint%29"; + private final String query = ""; // private String query = "=(sources:engrXiv AND type:preprint)"; private final String protocolDescriptor = "rest_json2xml"; @@ -56,10 +62,12 @@ class RestCollectorPluginTest { params.put("resultSizeValue", resultSizeValue); params.put("queryParams", query); params.put("entityXpath", entityXpath); + params.put("requestHeaderMap", "{\"User-Agent\": \"OpenAIRE DEV\"}"); api.setBaseUrl(baseUrl); api.setParams(params); + rcp = new RestCollectorPlugin(new HttpClientParams()); } @@ -78,4 +86,20 @@ class RestCollectorPluginTest { log.info("{}", i.intValue()); Assertions.assertTrue(i.intValue() > 0); } + + @Disabled + @Test + void testUrl() throws IOException { + String url_s = "https://ddh-openapi.worldbank.org/search?&top=10"; + URL url = new URL(url_s); + final HttpURLConnection conn = (HttpURLConnection) url.openConnection(); + conn.setRequestMethod("GET"); + conn.setRequestProperty("User-Agent", "OpenAIRE"); + Gson gson = new Gson(); + System.out.println("Request header"); + System.out.println(gson.toJson(conn.getHeaderFields())); + InputStream inputStream = conn.getInputStream(); + + + } } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java index e2d6ad3e7..ed31c2b7e 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java @@ -44,7 +44,7 @@ public class RestIteratorTest { final RestIterator iterator = new RestIterator(clientParams, baseUrl, resumptionType, resumptionParam, resumptionXpath, resultTotalXpath, resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValue, - query, entityXpath, authMethod, authToken, resultOffsetParam); + query, entityXpath, authMethod, authToken, resultOffsetParam, null); int i = 20; while (iterator.hasNext() && i > 0) { String result = iterator.next(); From c272c4ad68255820fe6d9fd3d4aac182da1f5678 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 29 May 2024 15:50:07 +0200 Subject: [PATCH 12/24] code formatting --- .../plugin/rest/RestCollectorPlugin.java | 5 +++-- .../dhp/collection/plugin/rest/RestIterator.java | 16 ++++++++-------- .../plugin/rest/RestCollectorPluginTest.java | 5 ++--- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java index 8445e49e0..f4ba09f72 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java @@ -8,9 +8,10 @@ import java.util.Spliterators; import java.util.stream.Stream; import java.util.stream.StreamSupport; -import com.google.gson.Gson; import org.apache.commons.lang3.StringUtils; +import com.google.gson.Gson; + import eu.dnetlib.dhp.collection.ApiDescriptor; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; import eu.dnetlib.dhp.common.aggregation.AggregatorReport; @@ -95,7 +96,7 @@ public class RestCollectorPlugin implements CollectorPlugin { authMethod, authToken, resultOutputFormat, - 
requestHeaders); + requestHeaders); return StreamSupport .stream( diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java index e51c9eb1b..2518fd92f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java @@ -25,7 +25,6 @@ import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; -import com.google.common.collect.Maps; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.http.HttpHeaders; @@ -36,6 +35,8 @@ import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.InputSource; +import com.google.common.collect.Maps; + import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils; import eu.dnetlib.dhp.common.collection.CollectorException; import eu.dnetlib.dhp.common.collection.HttpClientParams; @@ -51,7 +52,6 @@ import eu.dnetlib.dhp.common.collection.HttpClientParams; */ public class RestIterator implements Iterator { - private static final Logger log = LoggerFactory.getLogger(RestIterator.class); public static final String UTF_8 = "UTF-8"; private static final int MAX_ATTEMPTS = 5; @@ -93,11 +93,10 @@ public class RestIterator implements Iterator { private final String resultOutputFormat; /* - Can be used to set additional request headers, like for content negotiation - */ + * Can be used to set additional request headers, like for content negotiation + */ private Map requestHeaders; - /** * RestIterator class compatible to version 1.3.33 */ @@ -245,11 +244,11 @@ public class RestIterator implements Iterator { if (this.authMethod == "bearer") { log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml); requestHeaders.put("Authorization", "Bearer " + authToken); - //requestHeaders.put("Content-Type", "application/json"); + // requestHeaders.put("Content-Type", "application/json"); } else if (AUTHBASIC.equalsIgnoreCase(this.authMethod)) { log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml); requestHeaders.put("Authorization", "Basic " + authToken); - //requestHeaders.put("accept", "application/xml"); + // requestHeaders.put("accept", "application/xml"); } HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); conn.setRequestMethod("GET"); @@ -386,7 +385,8 @@ public class RestIterator implements Iterator { try { if (this.resultTotal == -1) { this.resultTotal = Integer.parseInt(this.xprResultTotalPath.evaluate(resultNode)); - if ("page".equalsIgnoreCase(this.resumptionType) && !this.AUTHBASIC.equalsIgnoreCase(this.authMethod)) { + if ("page".equalsIgnoreCase(this.resumptionType) + && !this.AUTHBASIC.equalsIgnoreCase(this.authMethod)) { this.resultTotal += 1; } // to correct the upper bound log.info("resultTotal was -1 is now: " + this.resultTotal); diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java index a9fc325c3..99b95d9e3 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java +++ 
b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java @@ -13,11 +13,12 @@ import java.util.HashMap; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Stream; -import com.google.gson.Gson; import org.junit.jupiter.api.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.gson.Gson; + import eu.dnetlib.dhp.collection.ApiDescriptor; import eu.dnetlib.dhp.common.aggregation.AggregatorReport; import eu.dnetlib.dhp.common.collection.CollectorException; @@ -67,7 +68,6 @@ class RestCollectorPluginTest { api.setBaseUrl(baseUrl); api.setParams(params); - rcp = new RestCollectorPlugin(new HttpClientParams()); } @@ -100,6 +100,5 @@ class RestCollectorPluginTest { System.out.println(gson.toJson(conn.getHeaderFields())); InputStream inputStream = conn.getInputStream(); - } } From a02f3f0d2b5b87fe6cf2b2b37f0e54832b16ce8c Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 30 May 2024 10:21:18 +0200 Subject: [PATCH 13/24] code formatting --- .../collection/plugin/utils/XMLIterator.java | 32 +++++---- .../plugin/file/FileGZipMultipleNodeTest.java | 69 ++++++++++--------- 2 files changed, 52 insertions(+), 49 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XMLIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XMLIterator.java index ca351346c..7e5c5e3c3 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XMLIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XMLIterator.java @@ -70,10 +70,11 @@ public class XMLIterator implements Iterator { super(); this.element = element; if (element.contains(",")) { - elements= Arrays.stream(element.split(",")) - .filter(StringUtils::isNoneBlank) - .map(String::toLowerCase) - .collect(Collectors.toList()); + elements = Arrays + .stream(element.split(",")) + .filter(StringUtils::isNoneBlank) + .map(String::toLowerCase) + .collect(Collectors.toList()); } this.inputStream = inputStream; this.parser = getParser(); @@ -155,16 +156,16 @@ public class XMLIterator implements Iterator { XMLEvent peek = parser.peek(); if (peek != null && peek.isStartElement()) { String name = peek.asStartElement().getName().getLocalPart(); - if( isCheckTag(name)) - return peek; + if (isCheckTag(name)) + return peek; } while (parser.hasNext()) { - XMLEvent event= parser.nextEvent(); + XMLEvent event = parser.nextEvent(); if (event != null && event.isStartElement()) { String name = event.asStartElement().getName().getLocalPart(); - if( isCheckTag(name)) - return event; + if (isCheckTag(name)) + return event; } } return null; @@ -181,12 +182,13 @@ public class XMLIterator implements Iterator { } private boolean isCheckTag(final String tagName) { - if (elements!= null) { - final String found =elements.stream() - .filter(e -> e.equalsIgnoreCase(tagName)) - .findFirst() - .orElse(null); - if (found!= null) + if (elements != null) { + final String found = elements + .stream() + .filter(e -> e.equalsIgnoreCase(tagName)) + .findFirst() + .orElse(null); + if (found != null) return true; } else { if (element.equalsIgnoreCase(tagName)) { diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/file/FileGZipMultipleNodeTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/file/FileGZipMultipleNodeTest.java index 2ed199156..2b5e90ab2 
100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/file/FileGZipMultipleNodeTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/file/FileGZipMultipleNodeTest.java @@ -1,9 +1,11 @@ + package eu.dnetlib.dhp.collection.plugin.file; +import java.io.IOException; +import java.util.HashMap; +import java.util.Objects; +import java.util.stream.Stream; -import eu.dnetlib.dhp.collection.ApiDescriptor; -import eu.dnetlib.dhp.common.aggregation.AggregatorReport; -import eu.dnetlib.dhp.common.collection.CollectorException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.junit.jupiter.api.*; @@ -12,52 +14,51 @@ import org.mockito.junit.jupiter.MockitoExtension; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; -import java.util.HashMap; -import java.util.Objects; -import java.util.stream.Stream; +import eu.dnetlib.dhp.collection.ApiDescriptor; +import eu.dnetlib.dhp.common.aggregation.AggregatorReport; +import eu.dnetlib.dhp.common.collection.CollectorException; @TestMethodOrder(MethodOrderer.OrderAnnotation.class) @ExtendWith(MockitoExtension.class) public class FileGZipMultipleNodeTest { - private static final Logger log = LoggerFactory.getLogger(FileGZipCollectorPluginTest.class); + private static final Logger log = LoggerFactory.getLogger(FileGZipCollectorPluginTest.class); - private final ApiDescriptor api = new ApiDescriptor(); + private final ApiDescriptor api = new ApiDescriptor(); - private FileGZipCollectorPlugin plugin; + private FileGZipCollectorPlugin plugin; - private static final String SPLIT_ON_ELEMENT = "incollection,article"; + private static final String SPLIT_ON_ELEMENT = "incollection,article"; - @BeforeEach - public void setUp() throws IOException { + @BeforeEach + public void setUp() throws IOException { - final String gzipFile = Objects - .requireNonNull( - this - .getClass() - .getResource("/eu/dnetlib/dhp/collection/plugin/file/dblp.gz")) - .getFile(); + final String gzipFile = Objects + .requireNonNull( + this + .getClass() + .getResource("/eu/dnetlib/dhp/collection/plugin/file/dblp.gz")) + .getFile(); - api.setBaseUrl(gzipFile); + api.setBaseUrl(gzipFile); - HashMap params = new HashMap<>(); - params.put("splitOnElement", SPLIT_ON_ELEMENT); + HashMap params = new HashMap<>(); + params.put("splitOnElement", SPLIT_ON_ELEMENT); - api.setParams(params); + api.setParams(params); - FileSystem fs = FileSystem.get(new Configuration()); - plugin = new FileGZipCollectorPlugin(fs); - } + FileSystem fs = FileSystem.get(new Configuration()); + plugin = new FileGZipCollectorPlugin(fs); + } - @Test - void test() throws CollectorException { + @Test + void test() throws CollectorException { - final Stream stream = plugin.collect(api, new AggregatorReport()); + final Stream stream = plugin.collect(api, new AggregatorReport()); - stream.limit(10).forEach(s -> { - Assertions.assertTrue(s.length() > 0); - log.info(s); - }); - } + stream.limit(10).forEach(s -> { + Assertions.assertTrue(s.length() > 0); + log.info(s); + }); + } } From 81090ad593b1bb1572c033989c86e79f795670e6 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 5 Jun 2024 10:03:33 +0200 Subject: [PATCH 14/24] [IE OAIPHM] added oozie workflow, minor changes, code formatting --- .../dhp/oa/oaipmh/IrishOaiExporterJob.java | 68 ++++++----- .../dhp/oa/oaipmh/oozie_app/workflow.xml | 106 ++++++++++++++++++ .../dhp/oa/oaipmh/DbSerializationTest.java | 14 +-- 
.../oa/oaipmh/IrishOaiExporterJobTest.java | 3 +- 4 files changed, 155 insertions(+), 36 deletions(-) create mode 100644 dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/oaipmh/oozie_app/workflow.xml diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java index 433baf272..57f180fa0 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java @@ -46,15 +46,16 @@ public class IrishOaiExporterJob { public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString(XmlConverterJob.class - .getResourceAsStream("/eu/dnetlib/dhp/oa/oaipmh/input_params_irish_oai_exporter.json"))); + IOUtils + .toString( + XmlConverterJob.class + .getResourceAsStream("/eu/dnetlib/dhp/oa/oaipmh/input_params_irish_oai_exporter.json"))); parser.parseArgument(args); final Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); log.info("isSparkSessionManaged: {}", isSparkSessionManaged); final String inputPath = parser.get("inputPath"); @@ -62,9 +63,9 @@ public class IrishOaiExporterJob { final String dbUser = parser.get("dbUser"); final String dbPwd = parser.get("dbPwd"); final int numConnections = Optional - .ofNullable(parser.get("numConnections")) - .map(Integer::valueOf) - .orElse(NUM_CONNECTIONS); + .ofNullable(parser.get("numConnections")) + .map(Integer::valueOf) + .orElse(NUM_CONNECTIONS); log.info("inputPath: '{}'", inputPath); log.info("dbUrl: '{}'", dbUrl); @@ -78,29 +79,31 @@ public class IrishOaiExporterJob { final SparkConf conf = new SparkConf(); conf.registerKryoClasses(new Class[] { - SerializableSolrInputDocument.class + SerializableSolrInputDocument.class }); final Encoder encoderTuple = Encoders.bean(TupleWrapper.class); final Encoder encoderOaiRecord = Encoders.bean(OaiRecordWrapper.class); + final String date = LocalDateTime.now().toString(); + log.info("Creating temporary table..."); runWithSparkSession(conf, isSparkSessionManaged, spark -> { final Dataset docs = spark - .read() - .schema(encoderTuple.schema()) - .json(inputPath) - .as(encoderTuple) - .map((MapFunction) TupleWrapper::getXml, Encoders.STRING()) - .map((MapFunction) IrishOaiExporterJob::asIrishOaiResult, encoderOaiRecord) - .filter((FilterFunction) obj -> (obj != null) && StringUtils.isNotBlank(obj.getId())); + .read() + .schema(encoderTuple.schema()) + .json(inputPath) + .as(encoderTuple) + .map((MapFunction) TupleWrapper::getXml, Encoders.STRING()) + .map((MapFunction) r -> asIrishOaiResult(r, date), encoderOaiRecord) + .filter((FilterFunction) obj -> (obj != null) && StringUtils.isNotBlank(obj.getId())); docs - .repartition(numConnections) - .write() - .mode(SaveMode.Overwrite) - .jdbc(dbUrl, TMP_OAI_TABLE, connectionProperties); + .repartition(numConnections) + .write() + .mode(SaveMode.Overwrite) + .jdbc(dbUrl, TMP_OAI_TABLE, connectionProperties); }); log.info("Temporary table created."); @@ -108,14 +111,15 @@ public class IrishOaiExporterJob { log.info("Updating OAI records..."); try (final Connection con = 
DriverManager.getConnection(dbUrl, dbUser, dbPwd)) { try (final Statement st = con.createStatement()) { - final String query = IOUtils.toString(IrishOaiExporterJob.class.getResourceAsStream("oai-finalize.sql")); + final String query = IOUtils + .toString(IrishOaiExporterJob.class.getResourceAsStream("oai-finalize.sql")); st.execute(query); } } log.info("DONE."); } - protected static OaiRecordWrapper asIrishOaiResult(final String xml) { + protected static OaiRecordWrapper asIrishOaiResult(final String xml, final String date) { try { final Document doc = DocumentHelper.parseText(xml); final OaiRecordWrapper r = new OaiRecordWrapper(); @@ -123,7 +127,7 @@ public class IrishOaiExporterJob { if (isValid(doc)) { r.setId(doc.valueOf("//*[local-name()='objIdentifier']").trim()); r.setBody(gzip(doc.selectSingleNode("//*[local-name()='entity']").asXML())); - r.setDate(LocalDateTime.now().toString()); + r.setDate(date); r.setSets(new ArrayList<>()); } return r; @@ -140,19 +144,25 @@ public class IrishOaiExporterJob { if (n != null) { for (final Object o : n.selectNodes(".//*[local-name()='datainfo']/*[local-name()='deletedbyinference']")) { - if ("true".equals(((Node) o).getText().trim())) { return false; } + if ("true".equals(((Node) o).getText().trim())) { + return false; + } } // verify the main country of the result for (final Object o : n.selectNodes("./*[local-name()='country']")) { - if ("IE".equals(((Node) o).valueOf("@classid").trim())) { return true; } + if ("IE".equals(((Node) o).valueOf("@classid").trim())) { + return true; + } } // verify the countries of the related organizations for (final Object o : n.selectNodes(".//*[local-name()='rel']")) { final String relType = ((Node) o).valueOf("./*[local-name() = 'to']/@type").trim(); final String relCountry = ((Node) o).valueOf("./*[local-name() = 'country']/@classid").trim(); - if ("organization".equals(relType) && "IE".equals(relCountry)) { return true; } + if ("organization".equals(relType) && "IE".equals(relCountry)) { + return true; + } } } return false; @@ -160,7 +170,9 @@ public class IrishOaiExporterJob { } protected static byte[] gzip(final String str) { - if (StringUtils.isBlank(str)) { return null; } + if (StringUtils.isBlank(str)) { + return null; + } try (final ByteArrayOutputStream baos = new ByteArrayOutputStream()) { try (final GZIPOutputStream gzip = new GZIPOutputStream(baos)) { diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/oaipmh/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/oaipmh/oozie_app/workflow.xml new file mode 100644 index 000000000..c4caad91e --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/oaipmh/oozie_app/workflow.xml @@ -0,0 +1,106 @@ + + + + + inputPath + The path of the input records on HDFS + + + numConnections + number of connections to the postgres db (for the write operation) + + + dbUrl + the url of the database + + + dbUser + the user of the database + + + dbPwd + the password for the user of the database + + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + oozieActionShareLibForSpark2 + oozie action sharelib for spark 2.* + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + spark 2.* sql query 
execution listeners classname + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + + + + + ${jobTracker} + ${nameNode} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + yarn + cluster + Irish OAI-PHM provision + eu.dnetlib.dhp.oa.oaipmh.IrishOaiExporterJob + dhp-graph-provision-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=8000 + + --inputPath${inputPath} + --numConnections${numConnections} + --dbUrl${dbUrl} + --dbUser${dbUser} + --dbPwd${dbPwd} + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/oaipmh/DbSerializationTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/oaipmh/DbSerializationTest.java index f33708f86..d487fda94 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/oaipmh/DbSerializationTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/oaipmh/DbSerializationTest.java @@ -42,10 +42,10 @@ public class DbSerializationTest { conf.set("spark.driver.host", "localhost"); spark = SparkSession - .builder() - .appName("TEST") - .config(conf) - .getOrCreate(); + .builder() + .appName("TEST") + .config(conf) + .getOrCreate(); } @AfterAll @@ -79,9 +79,9 @@ public class DbSerializationTest { final Dataset docs = spark.createDataset(list, Encoders.bean(OaiRecordWrapper.class)); docs - .write() - .mode(SaveMode.Overwrite) - .jdbc(dbUrl, IrishOaiExporterJob.TMP_OAI_TABLE, connectionProperties); + .write() + .mode(SaveMode.Overwrite) + .jdbc(dbUrl, IrishOaiExporterJob.TMP_OAI_TABLE, connectionProperties); }); diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJobTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJobTest.java index 57a32e246..c16f75e1d 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJobTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJobTest.java @@ -10,6 +10,7 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.ByteArrayInputStream; import java.io.IOException; import java.nio.charset.Charset; +import java.time.LocalDateTime; import java.util.zip.GZIPInputStream; import org.apache.commons.io.IOUtils; @@ -23,7 +24,7 @@ public class IrishOaiExporterJobTest { @Test void testAsIrishOaiResult() throws Exception { final String xml = IOUtils.toString(getClass().getResourceAsStream("record_IE.xml")); - final OaiRecordWrapper res = IrishOaiExporterJob.asIrishOaiResult(xml); + final OaiRecordWrapper res = IrishOaiExporterJob.asIrishOaiResult(xml, LocalDateTime.now().toString()); assertNotNull(res.getId()); assertNotNull(res.getBody()); assertNotNull(res.getSets()); From 73bd1938a5a8dfead4035f50fded1108cf45a281 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 5 Jun 2024 12:17:35 +0200 Subject: [PATCH 
15/24] [graph2hive] use sparkExecutorMemory to define also the memoryOverhead --- .../dhp/oa/graph/hive/oozie_app/workflow.xml | 30 ++++++++++++++----- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml index 4468382be..eec67fc5c 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml @@ -116,17 +116,19 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + --conf spark.sql.shuffle.partitions=10000 --inputPath${inputPath}/publication --hiveDbName${hiveDbName} --classNameeu.dnetlib.dhp.schema.oaf.Publication --hiveMetastoreUris${hiveMetastoreUris} - --numPartitions8000 + --numPartitions10000 @@ -143,17 +145,19 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + --conf spark.sql.shuffle.partitions=4000 --inputPath${inputPath}/dataset --hiveDbName${hiveDbName} --classNameeu.dnetlib.dhp.schema.oaf.Dataset --hiveMetastoreUris${hiveMetastoreUris} - --numPartitions4000 + --numPartitions8000 @@ -170,11 +174,13 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + --conf spark.sql.shuffle.partitions=8000 --inputPath${inputPath}/otherresearchproduct --hiveDbName${hiveDbName} @@ -197,17 +203,19 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + --conf spark.sql.shuffle.partitions=1000 --inputPath${inputPath}/software --hiveDbName${hiveDbName} --classNameeu.dnetlib.dhp.schema.oaf.Software --hiveMetastoreUris${hiveMetastoreUris} - --numPartitions300 + --numPartitions1000 
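As a side note (not part of the committed change), every Spark action touched by this patch follows the same pattern: the off-heap spark.executor.memoryOverhead is set to the same value as the executor memory — which roughly doubles the YARN container request per executor — and spark.sql.shuffle.partitions is aligned with the per-entity --numPartitions. A minimal, hypothetical sketch of the equivalent programmatic configuration (class name and values are illustrative only, not taken from the workflow):

import org.apache.spark.SparkConf;

public class GraphToHiveConfSketch {

	public static void main(final String[] args) {
		// mirrors the --conf flags added by this patch; the concrete values are examples only
		final SparkConf conf = new SparkConf()
			.setAppName("graph2hive-example")
			.set("spark.executor.memory", "7g")
			// overhead set equal to the executor memory, as done above for each action
			.set("spark.executor.memoryOverhead", "7g")
			// shuffle parallelism sized per entity type (e.g. 10000 for publications)
			.set("spark.sql.shuffle.partitions", "10000");
		System.out.println(conf.toDebugString());
	}
}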
@@ -224,17 +232,19 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + --conf spark.sql.shuffle.partitions=200 --inputPath${inputPath}/datasource --hiveDbName${hiveDbName} --classNameeu.dnetlib.dhp.schema.oaf.Datasource --hiveMetastoreUris${hiveMetastoreUris} - --numPartitions100 + --numPartitions200 @@ -251,17 +261,19 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + --conf spark.sql.shuffle.partitions=1000 --inputPath${inputPath}/organization --hiveDbName${hiveDbName} --classNameeu.dnetlib.dhp.schema.oaf.Organization --hiveMetastoreUris${hiveMetastoreUris} - --numPartitions400 + --numPartitions1000 @@ -278,17 +290,19 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + --conf spark.sql.shuffle.partitions=1000 --inputPath${inputPath}/project --hiveDbName${hiveDbName} --classNameeu.dnetlib.dhp.schema.oaf.Project --hiveMetastoreUris${hiveMetastoreUris} - --numPartitions100 + --numPartitions1000 @@ -305,17 +319,19 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + --conf spark.sql.shuffle.partitions=15000 --inputPath${inputPath}/relation --hiveDbName${hiveDbName} --classNameeu.dnetlib.dhp.schema.oaf.Relation --hiveMetastoreUris${hiveMetastoreUris} - --numPartitions10000 + --numPartitions15000 From f70dc76b61a2597ed1a62d90a5f8394cf8710f19 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 6 Jun 2024 10:43:10 +0200 Subject: [PATCH 16/24] minor --- .../oaf/utils/ResultTypeComparator.java | 87 ------------------- .../dhp/oa/oaipmh/IrishOaiExporterJob.java | 3 +- 2 files changed, 1 insertion(+), 89 deletions(-) delete mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ResultTypeComparator.java diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ResultTypeComparator.java 
b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ResultTypeComparator.java deleted file mode 100644 index e10b281b8..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ResultTypeComparator.java +++ /dev/null @@ -1,87 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf.utils; - -import static eu.dnetlib.dhp.schema.common.ModelConstants.CROSSREF_ID; - -import java.util.Comparator; -import java.util.HashSet; -import java.util.Optional; -import java.util.stream.Collectors; - -import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.KeyValue; -import eu.dnetlib.dhp.schema.oaf.Result; - -public class ResultTypeComparator implements Comparator { - - public static final ResultTypeComparator INSTANCE = new ResultTypeComparator(); - - @Override - public int compare(Result left, Result right) { - - if (left == null && right == null) - return 0; - if (left == null) - return 1; - if (right == null) - return -1; - - HashSet lCf = getCollectedFromIds(left); - HashSet rCf = getCollectedFromIds(right); - - if (lCf.contains(CROSSREF_ID) && !rCf.contains(CROSSREF_ID)) { - return -1; - } - if (!lCf.contains(CROSSREF_ID) && rCf.contains(CROSSREF_ID)) { - return 1; - } - - if (left.getResulttype() == null || left.getResulttype().getClassid() == null) { - if (right.getResulttype() == null || right.getResulttype().getClassid() == null) { - return 0; - } - return 1; - } else if (right.getResulttype() == null || right.getResulttype().getClassid() == null) { - return -1; - } - - String lClass = left.getResulttype().getClassid(); - String rClass = right.getResulttype().getClassid(); - - if (!lClass.equals(rClass)) { - if (lClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID)) - return -1; - if (rClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID)) - return 1; - - if (lClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID)) - return -1; - if (rClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID)) - return 1; - - if (lClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID)) - return -1; - if (rClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID)) - return 1; - - if (lClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID)) - return -1; - if (rClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID)) - return 1; - } - - // Else (but unlikely), lexicographical ordering will do. 
- return lClass.compareTo(rClass); - } - - protected HashSet getCollectedFromIds(Result left) { - return Optional - .ofNullable(left.getCollectedfrom()) - .map( - cf -> cf - .stream() - .map(KeyValue::getKey) - .collect(Collectors.toCollection(HashSet::new))) - .orElse(new HashSet<>()); - } -} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java index 57f180fa0..3d69370af 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java @@ -31,7 +31,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.oa.provision.XmlConverterJob; import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument; import eu.dnetlib.dhp.oa.provision.model.TupleWrapper; @@ -48,7 +47,7 @@ public class IrishOaiExporterJob { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils .toString( - XmlConverterJob.class + IrishOaiExporterJob.class .getResourceAsStream("/eu/dnetlib/dhp/oa/oaipmh/input_params_irish_oai_exporter.json"))); parser.parseArgument(args); From ce2364743a445e42c434c910852dc26a1b75a7a8 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 6 Jun 2024 10:43:43 +0200 Subject: [PATCH 17/24] applying changes from PR#442: Fix for missing collectedfrom after dedup --- .../oaf/utils/MergeEntitiesComparator.java | 104 ++++++++++++++++++ .../dhp/schema/oaf/utils/MergeUtils.java | 53 +++++---- .../dhp/oa/dedup/DatasetMergerTest.java | 103 +++++++++++++++++ .../dhp/oa/dedup/EntityMergerTest.java | 10 +- .../dnetlib/dhp/dedup/json/dataset_merge.json | 2 + 5 files changed, 239 insertions(+), 33 deletions(-) create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeEntitiesComparator.java create mode 100644 dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/DatasetMergerTest.java create mode 100644 dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/dataset_merge.json diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeEntitiesComparator.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeEntitiesComparator.java new file mode 100644 index 000000000..5792fc10f --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeEntitiesComparator.java @@ -0,0 +1,104 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import static eu.dnetlib.dhp.schema.common.ModelConstants.CROSSREF_ID; + +import java.util.*; +import java.util.stream.Collectors; + +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.OafEntity; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class MergeEntitiesComparator implements Comparator { + static final List PID_AUTHORITIES = Arrays + .asList( + ModelConstants.ARXIV_ID, + ModelConstants.PUBMED_CENTRAL_ID, + ModelConstants.EUROPE_PUBMED_CENTRAL_ID, + ModelConstants.DATACITE_ID, + ModelConstants.CROSSREF_ID); + + static final List RESULT_TYPES = Arrays + .asList( + ModelConstants.ORP_RESULTTYPE_CLASSID, + ModelConstants.SOFTWARE_RESULTTYPE_CLASSID, + ModelConstants.DATASET_RESULTTYPE_CLASSID, + 
ModelConstants.PUBLICATION_RESULTTYPE_CLASSID); + + public static final Comparator INSTANCE = new MergeEntitiesComparator(); + + @Override + public int compare(Oaf left, Oaf right) { + if (left == null && right == null) + return 0; + if (left == null) + return -1; + if (right == null) + return 1; + + int res = 0; + + // pid authority + int cfp1 = left + .getCollectedfrom() + .stream() + .map(kv -> PID_AUTHORITIES.indexOf(kv.getKey())) + .max(Integer::compare) + .orElse(-1); + int cfp2 = right + .getCollectedfrom() + .stream() + .map(kv -> PID_AUTHORITIES.indexOf(kv.getKey())) + .max(Integer::compare) + .orElse(-1); + + if (cfp1 >= 0 && cfp1 > cfp2) { + return 1; + } else if (cfp2 >= 0 && cfp2 > cfp1) { + return -1; + } + + // trust + if (left.getDataInfo() != null && right.getDataInfo() != null) { + res = left.getDataInfo().getTrust().compareTo(right.getDataInfo().getTrust()); + } + + // result type + if (res == 0) { + if (left instanceof Result && right instanceof Result) { + Result r1 = (Result) left; + Result r2 = (Result) right; + + if (r1.getResulttype() == null || r1.getResulttype().getClassid() == null) { + if (r2.getResulttype() != null && r2.getResulttype().getClassid() != null) { + return -1; + } + } else if (r2.getResulttype() == null || r2.getResulttype().getClassid() == null) { + return 1; + } + + int rt1 = RESULT_TYPES.indexOf(r1.getResulttype().getClassid()); + int rt2 = RESULT_TYPES.indexOf(r2.getResulttype().getClassid()); + + if (rt1 >= 0 && rt1 > rt2) { + return 1; + } else if (rt2 >= 0 && rt2 > rt1) { + return -1; + } + } + } + + // id + if (res == 0) { + if (left instanceof OafEntity && right instanceof OafEntity) { + res = ((OafEntity) left).getId().compareTo(((OafEntity) right).getId()); + } + } + + return res; + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java index 28db94766..f1221add3 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java @@ -40,27 +40,12 @@ public class MergeUtils { public static T mergeGroup(String s, Iterator oafEntityIterator, boolean checkDelegateAuthority) { - TreeSet sortedEntities = new TreeSet<>((o1, o2) -> { - int res = 0; - if (o1.getDataInfo() != null && o2.getDataInfo() != null) { - res = o1.getDataInfo().getTrust().compareTo(o2.getDataInfo().getTrust()); - } + ArrayList sortedEntities = new ArrayList<>(); + oafEntityIterator.forEachRemaining(sortedEntities::add); + sortedEntities.sort(MergeEntitiesComparator.INSTANCE.reversed()); - if (res == 0) { - if (o1 instanceof Result && o2 instanceof Result) { - return ResultTypeComparator.INSTANCE.compare((Result) o1, (Result) o2); - } - } - - return res; - }); - - while (oafEntityIterator.hasNext()) { - sortedEntities.add(oafEntityIterator.next()); - } - - Iterator it = sortedEntities.descendingIterator(); + Iterator it = sortedEntities.iterator(); T merged = it.next(); while (it.hasNext()) { @@ -143,7 +128,7 @@ public class MergeUtils { * https://graph.openaire.eu/docs/data-model/pids-and-identifiers#delegated-authorities and in that case it prefers * such version. *
- * Otherwise, it considers a resulttype priority order implemented in {@link ResultTypeComparator} + * Otherwise, it considers a resulttype priority order implemented in {@link MergeEntitiesComparator} * and proceeds with the canonical property merging. * * @param left @@ -161,8 +146,9 @@ public class MergeUtils { if (!leftFromDelegatedAuthority && rightFromDelegatedAuthority) { return right; } + // TODO: raise trust to have preferred fields from one or the other?? - if (new ResultTypeComparator().compare(left, right) < 0) { + if (MergeEntitiesComparator.INSTANCE.compare(left, right) > 0) { return mergeResultFields(left, right); } else { return mergeResultFields(right, left); @@ -225,9 +211,9 @@ public class MergeUtils { private static List mergeLists(final List left, final List right, int trust, Function keyExtractor, BinaryOperator merger) { - if (left == null) { - return right; - } else if (right == null) { + if (left == null || left.isEmpty()) { + return right != null ? right : new ArrayList<>(); + } else if (right == null || right.isEmpty()) { return left; } @@ -405,7 +391,7 @@ public class MergeUtils { } // should be an instance attribute, get the first non-null value - merge.setLanguage(coalesce(merge.getLanguage(), enrich.getLanguage())); + merge.setLanguage(coalesceQualifier(merge.getLanguage(), enrich.getLanguage())); // distinct countries, do not manage datainfo merge.setCountry(mergeQualifiers(merge.getCountry(), enrich.getCountry(), trust)); @@ -575,6 +561,13 @@ public class MergeUtils { return m != null ? m : e; } + private static Qualifier coalesceQualifier(Qualifier m, Qualifier e) { + if (m == null || m.getClassid() == null || StringUtils.isBlank(m.getClassid())) { + return e; + } + return m; + } + private static List mergeAuthors(List author, List author1, int trust) { List> authors = new ArrayList<>(); if (author != null) { @@ -587,6 +580,10 @@ public class MergeUtils { } private static String instanceKeyExtractor(Instance i) { + // three levels of concatenating: + // 1. :: + // 2. @@ + // 3. 
|| return String .join( "::", @@ -594,10 +591,10 @@ public class MergeUtils { kvKeyExtractor(i.getCollectedfrom()), qualifierKeyExtractor(i.getAccessright()), qualifierKeyExtractor(i.getInstancetype()), - Optional.ofNullable(i.getUrl()).map(u -> String.join("::", u)).orElse(null), + Optional.ofNullable(i.getUrl()).map(u -> String.join("@@", u)).orElse(null), Optional .ofNullable(i.getPid()) - .map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("::"))) + .map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("@@"))) .orElse(null)); } @@ -706,7 +703,7 @@ public class MergeUtils { private static String spKeyExtractor(StructuredProperty sp) { return Optional .ofNullable(sp) - .map(s -> Joiner.on("::").join(s, qualifierKeyExtractor(s.getQualifier()))) + .map(s -> Joiner.on("||").join(qualifierKeyExtractor(s.getQualifier()), s.getValue())) .orElse(null); } diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/DatasetMergerTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/DatasetMergerTest.java new file mode 100644 index 000000000..726814c43 --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/DatasetMergerTest.java @@ -0,0 +1,103 @@ + +package eu.dnetlib.dhp.oa.dedup; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.io.Serializable; +import java.lang.reflect.InvocationTargetException; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.List; + +import org.codehaus.jackson.map.ObjectMapper; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils; +import eu.dnetlib.pace.util.MapDocumentUtil; +import scala.Tuple2; + +class DatasetMergerTest implements Serializable { + + private List> datasets; + + private String testEntityBasePath; + private DataInfo dataInfo; + private final String dedupId = "50|doi_________::3d18564ef27ebe9ef3bd8b4dec67e148"; + private Dataset dataset_top; + + @BeforeEach + public void setUp() throws Exception { + testEntityBasePath = Paths + .get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/json").toURI()) + .toFile() + .getAbsolutePath(); + + datasets = readSample(testEntityBasePath + "/dataset_merge.json", Dataset.class); + + dataset_top = getTopPub(datasets); + + dataInfo = setDI(); + } + + @Test + void datasetMergerTest() throws InstantiationException, IllegalAccessException, InvocationTargetException { + Dataset pub_merged = MergeUtils.mergeGroup(dedupId, datasets.stream().map(Tuple2::_2).iterator()); + + // verify id + assertEquals(dedupId, pub_merged.getId()); + assertEquals(2, pub_merged.getInstance().size()); + } + + public DataInfo setDI() { + DataInfo dataInfo = new DataInfo(); + dataInfo.setTrust("0.9"); + dataInfo.setDeletedbyinference(false); + dataInfo.setInferenceprovenance("testing"); + dataInfo.setInferred(true); + return dataInfo; + } + + public Dataset getTopPub(List> publications) { + + Double maxTrust = 0.0; + Dataset maxPub = new Dataset(); + for (Tuple2 publication : publications) { + Double pubTrust = Double.parseDouble(publication._2().getDataInfo().getTrust()); + if (pubTrust > maxTrust) { + maxTrust = pubTrust; + maxPub = publication._2(); + } + } + return maxPub; + } + + public List> 
readSample(String path, Class clazz) { + List> res = new ArrayList<>(); + BufferedReader reader; + try { + reader = new BufferedReader(new FileReader(path)); + String line = reader.readLine(); + while (line != null) { + res + .add( + new Tuple2<>( + MapDocumentUtil.getJPathString("$.id", line), + new ObjectMapper().readValue(line, clazz))); + // read next line + line = reader.readLine(); + } + reader.close(); + } catch (IOException e) { + e.printStackTrace(); + } + + return res; + } + +} diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java index 4a5a3bd1b..995407edb 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java @@ -93,14 +93,14 @@ class EntityMergerTest implements Serializable { assertEquals(pub_top.getJournal().getConferencedate(), pub_merged.getJournal().getConferencedate()); assertEquals(pub_top.getJournal().getConferenceplace(), pub_merged.getJournal().getConferenceplace()); assertEquals("OPEN", pub_merged.getBestaccessright().getClassid()); - assertEquals(pub_top.getResulttype(), pub_merged.getResulttype()); - assertEquals(pub_top.getLanguage(), pub_merged.getLanguage()); - assertEquals(pub_top.getPublisher(), pub_merged.getPublisher()); - assertEquals(pub_top.getEmbargoenddate(), pub_merged.getEmbargoenddate()); + assertEquals(pub_top.getResulttype().getClassid(), pub_merged.getResulttype().getClassid()); + assertEquals(pub_top.getLanguage().getClassid(), pub_merged.getLanguage().getClassid()); + assertEquals("Elsevier BV", pub_merged.getPublisher().getValue()); + assertEquals(pub_top.getEmbargoenddate().getValue(), pub_merged.getEmbargoenddate().getValue()); assertEquals(pub_top.getResourcetype().getClassid(), ""); assertEquals(pub_top.getDateoftransformation(), pub_merged.getDateoftransformation()); assertEquals(pub_top.getOaiprovenance(), pub_merged.getOaiprovenance()); - assertEquals(pub_top.getDateofcollection(), pub_merged.getDateofcollection()); + // assertEquals(pub_top.getDateofcollection(), pub_merged.getDateofcollection()); assertEquals(3, pub_merged.getInstance().size()); assertEquals(2, pub_merged.getCountry().size()); assertEquals(0, pub_merged.getSubject().size()); diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/dataset_merge.json b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/dataset_merge.json new file mode 100644 index 000000000..86d1a8133 --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/dataset_merge.json @@ -0,0 +1,2 @@ +{"publisher": {"value": "DANS Data Station Archaeology"}, "dateofcollection": "2024-04-30T12:49:55+0000", "dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:actionset", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": true}, "bestaccessright": {"classid": "UNKNOWN", "classname": "not available", "schemeid": "dnet:access_modes", "schemename": "dnet:access_modes"}, "language": {"classid": "und", "classname": "Undetermined", "schemeid": "dnet:languages", "schemename": "dnet:languages"}, "author": [{"affiliation": [{"dataInfo": {"invisible": false, "provenanceaction": {"classid": 
"sysimport:actionset", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "value": "(Geonius)"}], "fullname": "S.A.H. Augustin", "pid": [], "rank": 1}, {"affiliation": [{"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:actionset", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "value": "(Geonius)"}], "fullname": "J.J.G. Geraeds", "pid": [], "rank": 2}], "resourcetype": {"classid": "UNKNOWN", "classname": "Unknown", "schemeid": "dnet:dataCite_resource", "schemename": "dnet:dataCite_resource"}, "originalId": ["50|datacite____::3d18564ef27ebe9ef3bd8b4dec67e148", "10.17026/ar/vt9ya1"], "description": [{"value": "In opdracht van Gemeente Beesel heeft Geonius Archeologie in maart 2023 een Inventariserend Veldonderzoek door middel van Proefsleuven (IVO-P) uitgevoerd voor het plangebied Heijackerstraat te Beesel in de gemeente Beesel. Aanleiding voor het uitvoeren van het archeologisch onderzoek vormt de aanvraag van een omgevingsvergunning voor bouw van 20 nieuwe woningen. Uit het vooronderzoek is gebleken dat het plangebied in een dalvakteterras ligt rondom opgestoven landduinen langsheen de Maas. De bodem bestaat volgens de bodemkaart uit vorstvaaggronden. Het plangebied is in het verleden voor zover kon worden vastgesteld in gebruik geweest als bouwland en is niet bebouwd geweest. Het IVO-O heeft uitgewezen dat de bodemopbouw deels intact is, al lijken in sommige boringen sporen van vergravingen of verstoringen te bevatten. Op grond van de resultaten van het vooronderzoek is een hoge verwachting opgesteld op het voorkomen van archeologische waarden uit het paleolithicum tot aan de vroege middeleeuwen. Voor de periode late middeleeuwen en nieuwe tijd is een lage verwachting opgesteld. Op grond van de resultaten van het vooronderzoek is een IVO-P uitgevoerd. Hierbij is een vindplaats aangetroffen bestaande uit drie subrecente greppels en een tweetal recente verstoringen. De vindplaats is als niet behoudenswaardig gewaardeerd. 
Aanbevolen is het plangebied vrij te geven voor de geplande ontwikkeling."}], "title": [{"qualifier": {"classid": "main title", "classname": "main title", "schemeid": "dnet:dataCite_title", "schemename": "dnet:dataCite_title"}, "value": "Archeologisch onderzoek IVO-P plangebied Heijackerstraat te Beesel"}], "pid": [{"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:actionset", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemeid": "dnet:pid_types", "schemename": "dnet:pid_types"}, "value": "10.17026/ar/vt9ya1"}], "id": "50|doi_________::3d18564ef27ebe9ef3bd8b4dec67e148", "instance": [{"refereed": {"classid": "0002", "classname": "nonPeerReviewed", "schemeid": "dnet:review_levels", "schemename": "dnet:review_levels"}, "hostedby": {"key": "10|re3data_____::84e123776089ce3c7a33db98d9cd15a8", "value": "EASY"}, "url": ["https://dx.doi.org/10.17026/ar/vt9ya1"], "pid": [{"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:actionset", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemeid": "dnet:pid_types", "schemename": "dnet:pid_types"}, "value": "10.17026/ar/vt9ya1"}], "instanceTypeMapping": [{"originalType": "Dataset", "typeLabel": "dataset", "vocabularyName": "openaire::coar_resource_types_3_1", "typeCode": "http://purl.org/coar/resource_type/c_ddb1"}], "dateofacceptance": {"value": "2024-01-01"}, "collectedfrom": {"key": "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254", "value": "Datacite"}, "accessright": {"classid": "UNKNOWN", "classname": "not available", "schemeid": "dnet:access_modes", "schemename": "dnet:access_modes"}, "instancetype": {"classid": "0021", "classname": "Dataset", "schemeid": "dnet:publication_resource", "schemename": "dnet:publication_resource"}}], "relevantdate": [{"qualifier": {"classid": "issued", "classname": "issued", "schemeid": "dnet:dataCite_date", "schemename": "dnet:dataCite_date"}, "value": "2024-01-01"}], "resulttype": {"classid": "dataset", "classname": "dataset", "schemeid": "dnet:result_typologies", "schemename": "dnet:result_typologies"}, "context": [], "collectedfrom": [{"key": "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254", "value": "Datacite"}], "dateoftransformation": "2024-04-30T12:49:55+0000", "subject": [], "dateofacceptance": {"value": "2024-01-01"}, "metaResourceType": {"classid": "Research Data", "classname": "Research Data", "schemeid": "openaire::meta_resource_types", "schemename": "openaire::meta_resource_types"}} +{"geolocation": [{"box": "", "place": "", "point": ""}], "dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": true}, "resourcetype": {"classid": "dataset", "classname": "dataset", "schemeid": "dnet:dataCite_resource", "schemename": "dnet:dataCite_resource"}, "pid": [], "contributor": [{"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": 
"dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "value": "Geonius"}], "oaiprovenance": {"originDescription": {"metadataNamespace": "", "harvestDate": "2024-05-05T04:33:31Z", "baseURL": "https://easy.dans.knaw.nl/oai", "datestamp": "", "altered": true, "identifier": "oai:easy.dans.knaw.nl:easy-dataset:341200"}}, "bestaccessright": {"classid": "OPEN", "classname": "Open Access", "schemeid": "dnet:access_modes", "schemename": "dnet:access_modes"}, "relevantdate": [], "collectedfrom": [{"key": "10|re3data_____::730f562f9efe8a3b3742d2da510d4335", "value": "B2FIND"}], "id": "50|r3730f562f9e::ace629fb505b6b4343faca03edde1841", "subject": [{"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "Ancient Cultures"}, {"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "Humanities"}, {"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "Archaeology"}], "lastupdatetimestamp": 1716803651625, "author": [{"surname": "Augustin", "name": "S. A. H.", "pid": [], "rank": 1, "affiliation": [], "fullname": "S.A.H. Augustin"}, {"surname": "Geraeds", "name": "J. J. G.", "pid": [], "rank": 2, "affiliation": [], "fullname": "J.J.G. 
Geraeds"}], "instance": [{"refereed": {"classid": "0002", "classname": "nonPeerReviewed", "schemeid": "dnet:review_levels", "schemename": "dnet:review_levels"}, "hostedby": {"key": "10|re3data_____::84e123776089ce3c7a33db98d9cd15a8", "value": "DANS-EASY"}, "url": ["http://dx.doi.org/https://doi.org/10.17026/AR/VT9YA1"], "pid": [], "instanceTypeMapping": [{"originalType": "Dataset", "typeLabel": "dataset", "vocabularyName": "openaire::coar_resource_types_3_1", "typeCode": "http://purl.org/coar/resource_type/c_ddb1"}], "alternateIdentifier": [{"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemeid": "dnet:pid_types", "schemename": "dnet:pid_types"}, "value": "10.17026/ar/vt9ya1"}, {"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemeid": "dnet:pid_types", "schemename": "dnet:pid_types"}, "value": "10.17026/ar/vt9ya1"}, {"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemeid": "dnet:pid_types", "schemename": "dnet:pid_types"}, "value": "10.17026/ar/vt9ya1"}], "dateofacceptance": {"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "value": "2024-01-01"}, "collectedfrom": {"key": "10|re3data_____::730f562f9efe8a3b3742d2da510d4335", "value": "B2FIND"}, "accessright": {"classid": "OPEN", "classname": "Open Access", "schemeid": "dnet:access_modes", "schemename": "dnet:access_modes"}, "instancetype": {"classid": "0021", "classname": "Dataset", "schemeid": "dnet:publication_resource", "schemename": "dnet:publication_resource"}}], "dateofcollection": "2024-05-10T00:02:09+0000", "metaResourceType": {"classid": "Research Data", "classname": "Research Data", "schemeid": "openaire::meta_resource_types", "schemename": "openaire::meta_resource_types"}, "dateoftransformation": "2024-05-26T00:23:54.028Z", "description": [{"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "value": "In opdracht van Gemeente Beesel heeft Geonius Archeologie in maart 2023 een Inventariserend Veldonderzoek door middel van Proefsleuven (IVO-P) uitgevoerd voor het plangebied Heijackerstraat te Beesel in de gemeente Beesel. Aanleiding voor het uitvoeren van het archeologisch onderzoek vormt de aanvraag van een omgevingsvergunning voor bouw van 20 nieuwe woningen. 
Uit het vooronderzoek is gebleken dat het plangebied in een dalvakteterras ligt rondom opgestoven landduinen langsheen de Maas. De bodem bestaat volgens de bodemkaart uit vorstvaaggronden. Het plangebied is in het verleden voor zover kon worden vastgesteld in gebruik geweest als bouwland en is niet bebouwd geweest. Het IVO-O heeft uitgewezen dat de bodemopbouw deels intact is, al lijken in sommige boringen sporen van vergravingen of verstoringen te bevatten. Op grond van de resultaten van het vooronderzoek is een hoge verwachting opgesteld op het voorkomen van archeologische waarden uit het paleolithicum tot aan de vroege middeleeuwen. Voor de periode late middeleeuwen en nieuwe tijd is een lage verwachting opgesteld. Op grond van de resultaten van het vooronderzoek is een IVO-P uitgevoerd. Hierbij is een vindplaats aangetroffen bestaande uit drie subrecente greppels en een tweetal recente verstoringen. De vindplaats is als niet behoudenswaardig gewaardeerd. Aanbevolen is het plangebied vrij te geven voor de geplande ontwikkeling."}], "format": [], "coverage": [], "externalReference": [], "publisher": {"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "value": "Data Archiving and Networked Services (DANS)"}, "context": [], "eoscifguidelines": [], "language": {"classid": "und", "classname": "Undetermined", "schemeid": "dnet:languages", "schemename": "dnet:languages"}, "resulttype": {"classid": "dataset", "classname": "dataset", "schemeid": "dnet:result_typologies", "schemename": "dnet:result_typologies"}, "country": [], "extraInfo": [], "originalId": ["oai:easy.dans.knaw.nl:easy-dataset:341200", "50|r3730f562f9e::ace629fb505b6b4343faca03edde1841"], "source": [], "dateofacceptance": {"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "value": "2024-01-01"}, "title": [{"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "qualifier": {"classid": "main title", "classname": "main title", "schemeid": "dnet:dataCite_title", "schemename": "dnet:dataCite_title"}, "value": "Archeologisch onderzoek IVO-P plangebied Heijackerstraat te Beesel"}]} \ No newline at end of file From 92c3abd5a4e969cc47313b851f2f93f5c7dcfc87 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 29 May 2024 14:36:49 +0200 Subject: [PATCH 18/24] [graph cleaning] use sparkExecutorMemory to define also the memoryOverhead --- .../eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml index f6bf053cd..4188cb018 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml +++ 
b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml @@ -156,6 +156,7 @@ --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} @@ -190,6 +191,7 @@ --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} @@ -224,6 +226,7 @@ --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} @@ -258,6 +261,7 @@ --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} @@ -292,6 +296,7 @@ --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} @@ -326,6 +331,7 @@ --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} @@ -360,6 +366,7 @@ --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} @@ -394,6 +401,7 @@ --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} From 1477406ecc06eafcc704c55e2a317a74e13fcf62 Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Wed, 5 Jun 2024 16:20:40 +0200 Subject: [PATCH 19/24] [bulkTag] fixed issue that made project disappear in graph_10_enriched --- 
.../src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java index 9e1acc7b2..354741690 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java @@ -172,7 +172,7 @@ public class SparkBulkTagJob { .option("compression", "gzip") .json(outputPath + "project"); - readPath(spark, outputPath + "project", Datasource.class) + readPath(spark, outputPath + "project", Project.class) .write() .mode(SaveMode.Overwrite) .option("compression", "gzip") From ec79405cc9e4cbdadeb983be3b01408c259bc751 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 7 Jun 2024 11:30:31 +0200 Subject: [PATCH 20/24] [graph raw] set organization type from openorgs --- .../dhp/oa/graph/raw/MigrateDbEntitiesApplication.java | 1 + .../dnetlib/dhp/oa/graph/sql/queryOpenOrgsForProvision.sql | 7 +++++-- pom.xml | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index da6885db3..c9a32cde6 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -398,6 +398,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i o.setEcsmevalidated(field(Boolean.toString(rs.getBoolean("ecsmevalidated")), info)); o.setEcnutscode(field(Boolean.toString(rs.getBoolean("ecnutscode")), info)); o.setCountry(prepareQualifierSplitting(rs.getString("country"))); + o.setOrganizationType(Organization.OrganizationType.valueOf(rs.getString("typology"))); o.setDataInfo(info); o.setLastupdatetimestamp(lastUpdateTimestamp); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOpenOrgsForProvision.sql b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOpenOrgsForProvision.sql index 0ec303939..16ad9e265 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOpenOrgsForProvision.sql +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOpenOrgsForProvision.sql @@ -28,7 +28,8 @@ SELECT (array_remove(array_cat(ARRAY[o.ec_internationalorganization], array_agg(od.ec_internationalorganization)), NULL))[1] AS ecinternationalorganization, (array_remove(array_cat(ARRAY[o.ec_enterprise], array_agg(od.ec_enterprise)), NULL))[1] AS ecenterprise, (array_remove(array_cat(ARRAY[o.ec_smevalidated], array_agg(od.ec_smevalidated)), NULL))[1] AS ecsmevalidated, - (array_remove(array_cat(ARRAY[o.ec_nutscode], array_agg(od.ec_nutscode)), NULL))[1] AS ecnutscode + (array_remove(array_cat(ARRAY[o.ec_nutscode], array_agg(od.ec_nutscode)), NULL))[1] AS ecnutscode, + org_types.name AS typology FROM organizations o LEFT OUTER JOIN acronyms a ON (a.id = o.id) LEFT OUTER JOIN urls u ON (u.id = o.id) @@ -37,6 +38,7 @@ FROM organizations o LEFT OUTER JOIN oa_duplicates d ON (o.id = d.local_id AND d.reltype != 'is_different') LEFT OUTER 
JOIN organizations od ON (d.oa_original_id = od.id) LEFT OUTER JOIN other_ids idup ON (od.id = idup.id) + LEFT OUTER JOIN org_types ON (org_types.val = o.type) WHERE o.status = 'approved' OR o.status = 'suggested' GROUP BY @@ -44,4 +46,5 @@ GROUP BY o.name, o.creation_date, o.modification_date, - o.country; \ No newline at end of file + o.country, + org_types.name; \ No newline at end of file diff --git a/pom.xml b/pom.xml index cc8d509f7..9e554204d 100644 --- a/pom.xml +++ b/pom.xml @@ -960,7 +960,7 @@ 1.1.3 1.7 1.0.7 - [6.1.2] + [6.1.3-SNAPSHOT] cdh5.9.2 3.5 11.0.2 From c7265724182b7747540e69eaec2732799dca2d0f Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Fri, 7 Jun 2024 12:03:26 +0200 Subject: [PATCH 21/24] changed some parameters in OSF test --- .../dhp/collection/plugin/rest/OsfPreprintCollectorTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java index 0e64f8bab..a1b723e33 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java @@ -36,8 +36,8 @@ public class OsfPreprintCollectorTest { private final String resultTotalXpath = "/*/*[local-name()='links']/*[local-name()='meta']/*[local-name()='total']"; private final String resumptionParam = "page"; - private final String resumptionType = "page"; - private final String resumptionXpath = "/*/*[local-name()='links']/*[local-name()='next']"; + private final String resumptionType = "scan"; + private final String resumptionXpath = "substring-before(substring-after(/*/*[local-name()='links']/*[local-name()='next'], 'page='), '&')"; private final String resultSizeParam = "page[size]"; private final String resultSizeValue = "100"; From 3776327a8cc3e725af8af49255598681eac83d1f Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 10 Jun 2024 15:22:33 +0200 Subject: [PATCH 22/24] hostedby patching to work with the updated Crossref contents, resolved conflict --- .../SparkApplyHostedByMapToResult.scala | 39 ++++++++++++------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala index a900fc241..db7edf53e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala @@ -25,27 +25,38 @@ object SparkApplyHostedByMapToResult { val i = p.getInstance().asScala if (i.size == 1) { val inst: Instance = i.head - inst.getHostedby.setKey(ei.getHostedById) - inst.getHostedby.setValue(ei.getName) - if (ei.getOpenAccess) { - inst.setAccessright( - OafMapperUtils.accessRight( - ModelConstants.ACCESS_RIGHT_OPEN, - "Open Access", - ModelConstants.DNET_ACCESS_MODES, - ModelConstants.DNET_ACCESS_MODES - ) - ) - inst.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold) - p.setBestaccessright(OafMapperUtils.createBestAccessRights(p.getInstance())); - } + patchInstance(p, ei, inst) + } else { 
+ val cf = i.map(ii => ii.getCollectedfrom.getValue) + if (cf.contains("Crossref")) { + i.foreach(ii => { + patchInstance(p, ei, ii) + }) + } } } p })(Encoders.bean(classOf[Publication])) } + private def patchInstance(p: Publication, ei: EntityInfo, inst: Instance): Unit = { + inst.getHostedby.setKey(ei.getHostedById) + inst.getHostedby.setValue(ei.getName) + if (ei.getOpenAccess) { + inst.setAccessright( + OafMapperUtils.accessRight( + ModelConstants.ACCESS_RIGHT_OPEN, + "Open Access", + ModelConstants.DNET_ACCESS_MODES, + ModelConstants.DNET_ACCESS_MODES + ) + ) + inst.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold) + p.setBestaccessright(OafMapperUtils.createBestAccessRights(p.getInstance())); + } + } + def main(args: Array[String]): Unit = { val logger: Logger = LoggerFactory.getLogger(getClass) From b0eba210c068219580cfa78c17aa23f2e1e170f8 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 10 Jun 2024 16:15:07 +0200 Subject: [PATCH 23/24] [actionset promotion] use sparkExecutorMemory to define also the memoryOverhead --- dhp-shade-package/dependency-reduced-pom.xml | 113 ++++++++++++++++++ .../wf/dataset/oozie_app/workflow.xml | 2 + .../wf/datasource/oozie_app/workflow.xml | 1 + .../wf/main/oozie_app/workflow.xml | 1 + .../wf/organization/oozie_app/workflow.xml | 1 + .../oozie_app/workflow.xml | 4 +- .../wf/project/oozie_app/workflow.xml | 1 + .../wf/publication/oozie_app/workflow.xml | 6 +- .../wf/relation/oozie_app/workflow.xml | 3 +- .../wf/software/oozie_app/workflow.xml | 4 +- 10 files changed, 131 insertions(+), 5 deletions(-) create mode 100644 dhp-shade-package/dependency-reduced-pom.xml diff --git a/dhp-shade-package/dependency-reduced-pom.xml b/dhp-shade-package/dependency-reduced-pom.xml new file mode 100644 index 000000000..04843072f --- /dev/null +++ b/dhp-shade-package/dependency-reduced-pom.xml @@ -0,0 +1,113 @@ + + + + dhp + eu.dnetlib.dhp + 1.2.5-SNAPSHOT + + 4.0.0 + dhp-shade-package + This module create a jar of all module dependencies + + + + maven-shade-plugin + + + package + + shade + + + + + eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels + + + + META-INF/cxf/bus-extensions.txt + + + + + *:* + + META-INF/maven/** + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + com + repackaged.com.google.common + + com.google.common.** + + + + + + + + + + + + org.projectlombok + lombok + 1.18.28 + provided + + + org.junit.jupiter + junit-jupiter + 5.6.1 + test + + + junit-jupiter-api + org.junit.jupiter + + + junit-jupiter-params + org.junit.jupiter + + + junit-jupiter-engine + org.junit.jupiter + + + + + org.mockito + mockito-core + 3.3.3 + test + + + byte-buddy + net.bytebuddy + + + byte-buddy-agent + net.bytebuddy + + + + + org.mockito + mockito-junit-jupiter + 3.3.3 + test + + + + + DHPSite + ${dhp.site.stage.path}/dhp-common + + + diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/dataset/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/dataset/oozie_app/workflow.xml index 5401b45ca..b1bc1d6e1 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/dataset/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/dataset/oozie_app/workflow.xml @@ -103,6 +103,7 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf 
spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} @@ -156,6 +157,7 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/datasource/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/datasource/oozie_app/workflow.xml index f9bd66ae3..9a84f4708 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/datasource/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/datasource/oozie_app/workflow.xml @@ -95,6 +95,7 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml index 393f04e89..65ddd402b 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml @@ -125,6 +125,7 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/organization/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/organization/oozie_app/workflow.xml index ebfdeee31..0d73b498d 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/organization/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/organization/oozie_app/workflow.xml @@ -95,6 +95,7 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/otherresearchproduct/oozie_app/workflow.xml 
b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/otherresearchproduct/oozie_app/workflow.xml index 02399ed9b..ca8362c9b 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/otherresearchproduct/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/otherresearchproduct/oozie_app/workflow.xml @@ -103,6 +103,7 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} @@ -155,11 +156,12 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=2560 + --conf spark.sql.shuffle.partitions=8000 --inputGraphTablePath${workingDir}/otherresearchproduct --graphTableClassNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/project/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/project/oozie_app/workflow.xml index 57c2357b4..37310da79 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/project/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/project/oozie_app/workflow.xml @@ -95,6 +95,7 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml index 92b114776..a4b0b237c 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml @@ -103,11 +103,12 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7000 + --conf spark.sql.shuffle.partitions=15000 
--inputGraphTablePath${inputGraphRootPath}/publication --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication @@ -156,11 +157,12 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7000 + --conf spark.sql.shuffle.partitions=15000 --inputGraphTablePath${workingDir}/publication --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml index e9e5f0b45..f72847ba8 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml @@ -95,11 +95,12 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=10000 + --conf spark.sql.shuffle.partitions=15000 --inputGraphTablePath${inputGraphRootPath}/relation --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Relation diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/software/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/software/oozie_app/workflow.xml index 1d36ddf94..dbe8a63c1 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/software/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/software/oozie_app/workflow.xml @@ -103,6 +103,7 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} @@ -155,11 +156,12 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=2560 + --conf spark.sql.shuffle.partitions=4000 --inputGraphTablePath${workingDir}/software --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Software From 
85c1eae7e0da86e25aa2da74d90ac82ecf150e27 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Mon, 10 Jun 2024 19:03:30 +0200 Subject: [PATCH 24/24] Fixes for pagination strategy looping at end of download --- .../collection/plugin/rest/RestIterator.java | 58 +++++++++++++------ 1 file changed, 40 insertions(+), 18 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java index 2518fd92f..9037a454e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java @@ -12,6 +12,8 @@ import java.util.Iterator; import java.util.Map; import java.util.Queue; import java.util.concurrent.PriorityBlockingQueue; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; @@ -19,16 +21,10 @@ import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; -import javax.xml.xpath.XPath; -import javax.xml.xpath.XPathConstants; -import javax.xml.xpath.XPathExpression; -import javax.xml.xpath.XPathExpressionException; -import javax.xml.xpath.XPathFactory; +import javax.xml.xpath.*; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; -import org.apache.http.HttpHeaders; -import org.apache.http.entity.ContentType; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Node; @@ -51,7 +47,6 @@ import eu.dnetlib.dhp.common.collection.HttpClientParams; * */ public class RestIterator implements Iterator { - private static final Logger log = LoggerFactory.getLogger(RestIterator.class); public static final String UTF_8 = "UTF-8"; private static final int MAX_ATTEMPTS = 5; @@ -60,11 +55,15 @@ public class RestIterator implements Iterator { private final String AUTHBASIC = "basic"; + private static final String XML_HEADER = ""; + private static final String EMPTY_XML = XML_HEADER + "<" + JsonUtils.XML_WRAP_TAG + ">"; + private final String baseUrl; private final String resumptionType; private final String resumptionParam; private final String resultFormatValue; - private String queryParams; + private String queryParams = ""; private final int resultSizeValue; private int resumptionInt = 0; // integer resumption token (first record to harvest) private int resultTotal = -1; @@ -158,7 +157,12 @@ public class RestIterator implements Iterator { } private void initQueue() { - this.query = this.baseUrl + "?" + this.queryParams + this.querySize + this.queryFormat; + if (queryParams.equals("") && querySize.equals("") && queryFormat.equals("")) { + query = baseUrl; + } else { + query = baseUrl + "?" 
+ queryParams + querySize + queryFormat; + } + log.info("REST calls starting with {}", this.query); } @@ -219,9 +223,8 @@ public class RestIterator implements Iterator { try { String resultJson; - String resultXml = ""; + String resultXml = XML_HEADER; String nextQuery = ""; - final String emptyXml = resultXml + "<" + JsonUtils.XML_WRAP_TAG + ">"; Node resultNode = null; NodeList nodeList = null; String qUrlArgument = ""; @@ -236,6 +239,21 @@ public class RestIterator implements Iterator { } } + // find pagination page start number in queryParam and remove before start the first query + if ((resumptionType.toLowerCase().equals("pagination") || resumptionType.toLowerCase().equals("page")) + && (query.contains("paginationStart="))) { + + final Matcher m = Pattern.compile("paginationStart=([0-9]+)").matcher(query); + m.find(); // guaranteed to be true for this regex + + String[] pageVal = m.group(0).split("="); + pagination = Integer.parseInt(pageVal[1]); + + // remove page start number from queryParams + query = query.replaceFirst("&?paginationStart=[0-9]+", ""); + + } + try { log.info("requesting URL [{}]", query); @@ -261,7 +279,7 @@ public class RestIterator implements Iterator { this.resultStream = IOUtils.toInputStream(resultXml, UTF_8); } - if (!(emptyXml).equalsIgnoreCase(resultXml)) { + if (!isEmptyXml(resultXml)) { resultNode = (Node) this.xpath .evaluate("/", new InputSource(this.resultStream), XPathConstants.NODE); nodeList = (NodeList) this.xprEntity.evaluate(resultNode, XPathConstants.NODESET); @@ -270,8 +288,7 @@ public class RestIterator implements Iterator { final StringWriter sw = new StringWriter(); this.transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw)); final String toEnqueue = sw.toString(); - if ((toEnqueue == null) || StringUtils.isBlank(toEnqueue) - || emptyXml.equalsIgnoreCase(toEnqueue)) { + if ((toEnqueue == null) || StringUtils.isBlank(toEnqueue) || isEmptyXml(toEnqueue)) { log .warn( "The following record resulted in empty item for the feeding queue: {}", resultXml); @@ -299,6 +316,7 @@ public class RestIterator implements Iterator { throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2"); } qUrlArgument = qUrl.getQuery(); + final String[] arrayQUrlArgument = qUrlArgument.split("&"); for (final String arrayUrlArgStr : arrayQUrlArgument) { if (arrayUrlArgStr.startsWith(this.resumptionParam)) { @@ -312,7 +330,7 @@ public class RestIterator implements Iterator { } } - if (((emptyXml).equalsIgnoreCase(resultXml)) + if (isEmptyXml(resultXml) || ((nodeList != null) && (nodeList.getLength() < this.resultSizeValue))) { // resumptionStr = ""; if (nodeList != null) { @@ -331,13 +349,13 @@ public class RestIterator implements Iterator { case "pagination": case "page": // pagination, iterate over page numbers - this.pagination += 1; - if (nodeList != null) { + if (nodeList != null && nodeList.getLength() > 0) { this.discoverResultSize += nodeList.getLength(); } else { this.resultTotal = this.discoverResultSize; this.pagination = this.discoverResultSize; } + this.pagination += 1; this.resumptionInt = this.pagination; this.resumptionStr = Integer.toString(this.resumptionInt); break; @@ -415,6 +433,10 @@ public class RestIterator implements Iterator { } + private boolean isEmptyXml(String s) { + return EMPTY_XML.equalsIgnoreCase(s); + } + private boolean isInteger(final String s) { boolean isValidInteger = false; try {