From d64a942a76b240fd7529a708bac97a7d233258cb Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 9 Aug 2021 12:32:26 +0200 Subject: [PATCH 01/51] fixed MappersTest --- .../test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index c121dee2a..c431b4dd8 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -726,8 +726,8 @@ public class MappersTest { final Dataset p = (Dataset) list.get(0); assertValidId(p.getId()); - assertTrue(p.getOriginalId().size() == 1); - assertEquals("df76e73f-0483-49a4-a9bb-63f2f985574a", p.getOriginalId().get(0)); + assertEquals(2, p.getOriginalId().size()); + assertTrue(p.getOriginalId().stream().anyMatch(oid -> oid.equals("df76e73f-0483-49a4-a9bb-63f2f985574a"))); assertValidId(p.getCollectedfrom().get(0).getKey()); assertTrue(p.getAuthor().size() > 0); From 370dddb2fa8c27fbd95b618e22a2be9fcc8ebf40 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Tue, 7 Sep 2021 11:20:41 +0200 Subject: [PATCH 02/51] fix bug in OAI iterator that skipped cleaned records --- .../java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java index 75dd746ea..4b254c0ef 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java @@ -161,7 +161,7 @@ public class OaiIterator implements Iterator<String> { report.put(e.getClass().getName(), e.getMessage()); final String cleaned = XmlCleaner.cleanAllEntities(xml); try { - doc = DocumentHelper.parseText(xml); + doc = DocumentHelper.parseText(cleaned); } catch (final DocumentException e1) { final String resumptionToken = extractResumptionToken(xml); if (resumptionToken == null) { From ec94cc9b939d4cfe92643d0c1d14afe52e403559 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 1 Oct 2021 09:41:27 +0200 Subject: [PATCH 03/51] IndexNotificationsJob test: persist contents on HDFS instead of passing them to ES --- .../dhp/broker/oa/IndexNotificationsJob.java | 29 ++----------------- 1 file changed, 3 insertions(+), 26 deletions(-) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java index 80549e1ce..7b65c27ab 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java @@ -101,33 +101,10 @@ public class IndexNotificationsJob { ngEncoder) .flatMap((FlatMapFunction) g -> g.getData().iterator(), nEncoder); - final JavaRDD inputRdd = notifications + notifications .map((MapFunction) n -> prepareForIndexing(n, total), Encoders.STRING()) - .javaRDD(); - - final Map esCfg = new HashMap<>(); - // esCfg.put("es.nodes", "10.19.65.51, 10.19.65.52, 
10.19.65.53, 10.19.65.54"); - - esCfg.put("es.index.auto.create", "false"); - esCfg.put("es.nodes", indexHost); - esCfg.put("es.mapping.id", "notificationId"); // THE PRIMARY KEY - esCfg.put("es.batch.write.retry.count", esBatchWriteRetryCount); - esCfg.put("es.batch.write.retry.wait", esBatchWriteRetryWait); - esCfg.put("es.batch.size.entries", esBatchSizeEntries); - esCfg.put("es.nodes.wan.only", esNodesWanOnly); - - log.info("*** Start indexing"); - JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg); - log.info("*** End indexing"); - - log.info("*** Deleting old notifications"); - final String message = deleteOldNotifications(brokerApiBaseUrl, startTime - 1000); - log.info("*** Deleted notifications: " + message); - - log.info("*** sendNotifications (emails, ...)"); - sendNotifications(brokerApiBaseUrl, startTime - 1000); - log.info("*** ALL done."); - + .javaRDD() + .saveAsTextFile("/tmp/IndexNotificationsJob_test_6504"); } } From b01cd521b0eede2ad6ab8ab95e22f6a3dd14cae1 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 1 Oct 2021 11:26:33 +0200 Subject: [PATCH 04/51] removed configuration specifying the limit to 8 for spark.dynamicAllocation.maxExecutors --- .../dhp/broker/oa/notifications_only/oozie_app/workflow.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/notifications_only/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/notifications_only/oozie_app/workflow.xml index 248326d57..70ab7d344 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/notifications_only/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/notifications_only/oozie_app/workflow.xml @@ -135,7 +135,6 @@ --executor-memory=${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} - --conf spark.dynamicAllocation.maxExecutors="8" --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} From 31a6ad1d79cea595d85a1ef9a15fc6ea714a97f3 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Mon, 4 Oct 2021 12:01:56 +0200 Subject: [PATCH 05/51] optimization of verifySubsriptions() --- .../dhp/broker/oa/IndexNotificationsJob.java | 39 +++++---- .../broker/oa/IndexNotificationsJobTest.java | 84 +++++++++++++++++++ 2 files changed, 105 insertions(+), 18 deletions(-) create mode 100644 dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJobTest.java diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java index 7b65c27ab..c66e5f4f4 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java @@ -17,7 +17,6 @@ import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; @@ -25,7 +24,6 @@ import 
org.apache.spark.sql.Encoder; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; import org.apache.spark.util.LongAccumulator; -import org.elasticsearch.spark.rdd.api.java.JavaEsSpark; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -46,13 +44,14 @@ public class IndexNotificationsJob { private static final Logger log = LoggerFactory.getLogger(IndexNotificationsJob.class); + private static Map>> conditionsForSubscriptions = new HashMap<>(); + public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils - .toString( - IndexNotificationsJob.class - .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/index_notifications.json"))); + .toString(IndexNotificationsJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/index_notifications.json"))); parser.parseArgument(args); final SparkConf conf = new SparkConf(); @@ -89,6 +88,8 @@ public class IndexNotificationsJob { final List subscriptions = listSubscriptions(brokerApiBaseUrl); + initConditionsForSubscriptions(subscriptions); + log.info("Number of subscriptions: " + subscriptions.size()); if (subscriptions.size() > 0) { @@ -96,9 +97,7 @@ public class IndexNotificationsJob { final Encoder nEncoder = Encoders.bean(Notification.class); final Dataset notifications = ClusterUtils .readPath(spark, eventsPath, Event.class) - .map( - (MapFunction) e -> generateNotifications(e, subscriptions, startTime), - ngEncoder) + .map((MapFunction) e -> generateNotifications(e, subscriptions, startTime), ngEncoder) .flatMap((FlatMapFunction) g -> g.getData().iterator(), nEncoder); notifications @@ -108,20 +107,27 @@ public class IndexNotificationsJob { } } - private static NotificationGroup generateNotifications(final Event e, + protected static void initConditionsForSubscriptions(final List subscriptions) { + subscriptions.forEach(s -> conditionsForSubscriptions.put(s.getSubscriptionId(), s.conditionsAsMap())); + } + + protected static NotificationGroup generateNotifications(final Event e, final List subscriptions, final long date) { final List list = subscriptions .stream() - .filter( - s -> StringUtils.isBlank(s.getTopic()) || s.getTopic().equals("*") || s.getTopic().equals(e.getTopic())) - .filter(s -> verifyConditions(e.getMap(), s.conditionsAsMap())) + .filter(s -> StringUtils.isBlank(s.getTopic()) || s.getTopic().equals("*") || s.getTopic().equals(e.getTopic())) + .filter(s -> verifyConditions(e.getMap(), conditionsAsMap(s))) .map(s -> generateNotification(s, e, date)) .collect(Collectors.toList()); return new NotificationGroup(list); } + private static Map> conditionsAsMap(final Subscription s) { + return conditionsForSubscriptions.get(s.getSubscriptionId()); + } + private static Notification generateNotification(final Subscription s, final Event e, final long date) { final Notification n = new Notification(); n.setNotificationId("ntf-" + DigestUtils.md5Hex(s.getSubscriptionId() + "@@@" + e.getEventId())); @@ -145,18 +151,15 @@ public class IndexNotificationsJob { if (conditions.containsKey("trust") && !SubscriptionUtils - .verifyFloatRange( - map.getTrust(), conditions.get("trust").get(0).getValue(), - conditions.get("trust").get(0).getOtherValue())) { + .verifyFloatRange(map.getTrust(), conditions.get("trust").get(0).getValue(), conditions.get("trust").get(0).getOtherValue())) { return false; } if (conditions.containsKey("targetDateofacceptance") && !conditions .get("targetDateofacceptance") .stream() - .anyMatch( - c -> 
SubscriptionUtils - .verifyDateRange(map.getTargetDateofacceptance(), c.getValue(), c.getOtherValue()))) { + .anyMatch(c -> SubscriptionUtils + .verifyDateRange(map.getTargetDateofacceptance(), c.getValue(), c.getOtherValue()))) { return false; } diff --git a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJobTest.java b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJobTest.java new file mode 100644 index 000000000..ea5dbd5d1 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJobTest.java @@ -0,0 +1,84 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.util.Arrays; +import java.util.List; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import eu.dnetlib.dhp.broker.model.Event; +import eu.dnetlib.dhp.broker.model.MappedFields; +import eu.dnetlib.dhp.broker.model.Subscription; +import eu.dnetlib.dhp.broker.oa.util.NotificationGroup; + +class IndexNotificationsJobTest { + + private List subscriptions; + + @BeforeEach + void setUp() throws Exception { + final Subscription s = new Subscription(); + s.setTopic("ENRICH/MISSING/PID"); + s.setConditions("[{\"field\":\"targetDatasourceName\",\"fieldType\":\"STRING\",\"operator\":\"EXACT\",\"listParams\":[{\"value\":\"reposiTUm\"}]},{\"field\":\"trust\",\"fieldType\":\"FLOAT\",\"operator\":\"RANGE\",\"listParams\":[{\"value\":\"0\",\"otherValue\":\"1\"}]}]"); + subscriptions = Arrays.asList(s); + IndexNotificationsJob.initConditionsForSubscriptions(subscriptions); + } + + @Test + void testGenerateNotifications_invalid_topic() { + final Event event = new Event(); + event.setTopic("ENRICH/MISSING/PROJECT"); + + for (int i = 0; i < 10; i++) { + final long start = System.currentTimeMillis(); + final NotificationGroup res = IndexNotificationsJob.generateNotifications(event, subscriptions, 0); + final long end = System.currentTimeMillis(); + + System.out.println("no topic - execution time (ms): " + (end - start)); + + assertEquals(0, res.getData().size()); + } + } + + @Test + void testGenerateNotifications_topic_match() { + final Event event = new Event(); + event.setTopic("ENRICH/MISSING/PID"); + event.setMap(new MappedFields()); + event.getMap().setTargetDatasourceName("reposiTUm"); + event.getMap().setTrust(0.8f); + + for (int i = 0; i < 10; i++) { + final long start = System.currentTimeMillis(); + final NotificationGroup res = IndexNotificationsJob.generateNotifications(event, subscriptions, 0); + final long end = System.currentTimeMillis(); + + System.out.println("topic match - execution time (ms): " + (end - start)); + + assertEquals(1, res.getData().size()); + } + } + + @Test + void testGenerateNotifications_topic_no_match() { + final Event event = new Event(); + event.setTopic("ENRICH/MISSING/PID"); + event.setMap(new MappedFields()); + event.getMap().setTargetDatasourceName("Puma"); + event.getMap().setTrust(0.8f); + + for (int i = 0; i < 10; i++) { + final long start = System.currentTimeMillis(); + final NotificationGroup res = IndexNotificationsJob.generateNotifications(event, subscriptions, 0); + final long end = System.currentTimeMillis(); + + System.out.println("topic no match - execution time (ms): " + (end - start)); + + assertEquals(0, res.getData().size()); + } + } + +} From 0a9ef34b56cdc1fb9783a0efb84dab29b03ebcbb Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Mon, 4 Oct 2021 15:46:12 +0200 
Subject: [PATCH 06/51] test --- .../broker/oa/IndexNotificationsJobTest.java | 84 +++++++++++++------ 1 file changed, 60 insertions(+), 24 deletions(-) diff --git a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJobTest.java b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJobTest.java index ea5dbd5d1..6010da2c3 100644 --- a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJobTest.java +++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJobTest.java @@ -18,6 +18,8 @@ class IndexNotificationsJobTest { private List subscriptions; + private static final int N_TIMES = 1_000_000; + @BeforeEach void setUp() throws Exception { final Subscription s = new Subscription(); @@ -32,15 +34,8 @@ class IndexNotificationsJobTest { final Event event = new Event(); event.setTopic("ENRICH/MISSING/PROJECT"); - for (int i = 0; i < 10; i++) { - final long start = System.currentTimeMillis(); - final NotificationGroup res = IndexNotificationsJob.generateNotifications(event, subscriptions, 0); - final long end = System.currentTimeMillis(); - - System.out.println("no topic - execution time (ms): " + (end - start)); - - assertEquals(0, res.getData().size()); - } + final NotificationGroup res = IndexNotificationsJob.generateNotifications(event, subscriptions, 0); + assertEquals(0, res.getData().size()); } @Test @@ -51,15 +46,8 @@ class IndexNotificationsJobTest { event.getMap().setTargetDatasourceName("reposiTUm"); event.getMap().setTrust(0.8f); - for (int i = 0; i < 10; i++) { - final long start = System.currentTimeMillis(); - final NotificationGroup res = IndexNotificationsJob.generateNotifications(event, subscriptions, 0); - final long end = System.currentTimeMillis(); - - System.out.println("topic match - execution time (ms): " + (end - start)); - - assertEquals(1, res.getData().size()); - } + final NotificationGroup res = IndexNotificationsJob.generateNotifications(event, subscriptions, 0); + assertEquals(1, res.getData().size()); } @Test @@ -70,15 +58,63 @@ class IndexNotificationsJobTest { event.getMap().setTargetDatasourceName("Puma"); event.getMap().setTrust(0.8f); - for (int i = 0; i < 10; i++) { - final long start = System.currentTimeMillis(); - final NotificationGroup res = IndexNotificationsJob.generateNotifications(event, subscriptions, 0); - final long end = System.currentTimeMillis(); + final NotificationGroup res = IndexNotificationsJob.generateNotifications(event, subscriptions, 0); + assertEquals(0, res.getData().size()); + } - System.out.println("topic no match - execution time (ms): " + (end - start)); + @Test + void testGenerateNotifications_invalid_topic_repeated() { + final Event event = new Event(); + event.setTopic("ENRICH/MISSING/PROJECT"); - assertEquals(0, res.getData().size()); + // warm up + IndexNotificationsJob.generateNotifications(event, subscriptions, 0); + + final long start = System.currentTimeMillis(); + for (int i = 0; i < N_TIMES; i++) { + IndexNotificationsJob.generateNotifications(event, subscriptions, 0); } + final long end = System.currentTimeMillis(); + System.out.println(String.format("no topic - repeated %s times - execution time: %s ms ", N_TIMES, end - start)); + + } + + @Test + void testGenerateNotifications_topic_match_repeated() { + final Event event = new Event(); + event.setTopic("ENRICH/MISSING/PID"); + event.setMap(new MappedFields()); + event.getMap().setTargetDatasourceName("reposiTUm"); + 
event.getMap().setTrust(0.8f); + + // warm up + IndexNotificationsJob.generateNotifications(event, subscriptions, 0); + + final long start = System.currentTimeMillis(); + for (int i = 0; i < N_TIMES; i++) { + IndexNotificationsJob.generateNotifications(event, subscriptions, 0); + } + final long end = System.currentTimeMillis(); + System.out.println(String.format("topic match - repeated %s times - execution time: %s ms ", N_TIMES, end - start)); + } + + @Test + void testGenerateNotifications_topic_no_match_repeated() { + final Event event = new Event(); + event.setTopic("ENRICH/MISSING/PID"); + event.setMap(new MappedFields()); + event.getMap().setTargetDatasourceName("Puma"); + event.getMap().setTrust(0.8f); + + // warm up + IndexNotificationsJob.generateNotifications(event, subscriptions, 0); + + final long start = System.currentTimeMillis(); + for (int i = 0; i < N_TIMES; i++) { + IndexNotificationsJob.generateNotifications(event, subscriptions, 0); + } + final long end = System.currentTimeMillis(); + System.out.println(String.format("topic no match - repeated %s times - execution time: %s ms ", N_TIMES, end - start)); } } From 8bbaa173354e99610053c5adfefd3f1f5c78f43f Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Tue, 5 Oct 2021 09:20:37 +0200 Subject: [PATCH 07/51] reimplemented conditions cache as a non-static variable --- .../dhp/broker/oa/IndexNotificationsJob.java | 42 +++++++++++-------- .../broker/oa/IndexNotificationsJobTest.java | 41 +++++++++++------- 2 files changed, 51 insertions(+), 32 deletions(-) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java index c66e5f4f4..5dc3315c4 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java @@ -44,14 +44,13 @@ public class IndexNotificationsJob { private static final Logger log = LoggerFactory.getLogger(IndexNotificationsJob.class); - private static Map>> conditionsForSubscriptions = new HashMap<>(); - public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils - .toString(IndexNotificationsJob.class - .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/index_notifications.json"))); + .toString( + IndexNotificationsJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/index_notifications.json"))); parser.parseArgument(args); final SparkConf conf = new SparkConf(); @@ -88,16 +87,19 @@ public class 
IndexNotificationsJob { } } - protected static void initConditionsForSubscriptions(final List subscriptions) { - subscriptions.forEach(s -> conditionsForSubscriptions.put(s.getSubscriptionId(), s.conditionsAsMap())); + protected static Map>> prepareConditionsMap( + final List subscriptions) { + final Map>> map = new HashMap<>(); + subscriptions.forEach(s -> map.put(s.getSubscriptionId(), s.conditionsAsMap())); + return map; } protected static NotificationGroup generateNotifications(final Event e, final List subscriptions, + final Map>> conditionsMap, final long date) { final List list = subscriptions .stream() - .filter(s -> StringUtils.isBlank(s.getTopic()) || s.getTopic().equals("*") || s.getTopic().equals(e.getTopic())) - .filter(s -> verifyConditions(e.getMap(), conditionsAsMap(s))) + .filter( + s -> StringUtils.isBlank(s.getTopic()) || s.getTopic().equals("*") || s.getTopic().equals(e.getTopic())) + .filter(s -> verifyConditions(e.getMap(), conditionsMap.get(s.getSubscriptionId()))) .map(s -> generateNotification(s, e, date)) .collect(Collectors.toList()); return new NotificationGroup(list); } - private static Map> conditionsAsMap(final Subscription s) { - return conditionsForSubscriptions.get(s.getSubscriptionId()); - } - private static Notification generateNotification(final Subscription s, final Event e, final long date) { final Notification n = new Notification(); n.setNotificationId("ntf-" + DigestUtils.md5Hex(s.getSubscriptionId() + "@@@" + e.getEventId())); @@ -151,15 +154,18 @@ public class IndexNotificationsJob { if (conditions.containsKey("trust") && !SubscriptionUtils - .verifyFloatRange(map.getTrust(), conditions.get("trust").get(0).getValue(), conditions.get("trust").get(0).getOtherValue())) { + .verifyFloatRange( + map.getTrust(), conditions.get("trust").get(0).getValue(), + conditions.get("trust").get(0).getOtherValue())) { return false; } if (conditions.containsKey("targetDateofacceptance") && !conditions .get("targetDateofacceptance") .stream() - .anyMatch(c -> SubscriptionUtils - .verifyDateRange(map.getTargetDateofacceptance(), c.getValue(), c.getOtherValue()))) { + .anyMatch( + c -> SubscriptionUtils + .verifyDateRange(map.getTargetDateofacceptance(), c.getValue(), c.getOtherValue()))) { return false; } diff --git a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJobTest.java b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJobTest.java index 6010da2c3..8ecb3061b 100644 --- a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJobTest.java +++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJobTest.java @@ -5,10 +5,12 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import java.util.Arrays; import java.util.List; +import java.util.Map; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import eu.dnetlib.dhp.broker.model.ConditionParams; import eu.dnetlib.dhp.broker.model.Event; import eu.dnetlib.dhp.broker.model.MappedFields; import eu.dnetlib.dhp.broker.model.Subscription; @@ -18,15 +20,19 @@ class IndexNotificationsJobTest { private List subscriptions; + private Map>> conditionsMap; + private static final int N_TIMES = 1_000_000; @BeforeEach void setUp() throws Exception { final Subscription s = new Subscription(); s.setTopic("ENRICH/MISSING/PID"); - 
s.setConditions("[{\"field\":\"targetDatasourceName\",\"fieldType\":\"STRING\",\"operator\":\"EXACT\",\"listParams\":[{\"value\":\"reposiTUm\"}]},{\"field\":\"trust\",\"fieldType\":\"FLOAT\",\"operator\":\"RANGE\",\"listParams\":[{\"value\":\"0\",\"otherValue\":\"1\"}]}]"); + s + .setConditions( + "[{\"field\":\"targetDatasourceName\",\"fieldType\":\"STRING\",\"operator\":\"EXACT\",\"listParams\":[{\"value\":\"reposiTUm\"}]},{\"field\":\"trust\",\"fieldType\":\"FLOAT\",\"operator\":\"RANGE\",\"listParams\":[{\"value\":\"0\",\"otherValue\":\"1\"}]}]"); subscriptions = Arrays.asList(s); - IndexNotificationsJob.initConditionsForSubscriptions(subscriptions); + conditionsMap = IndexNotificationsJob.prepareConditionsMap(subscriptions); } @Test @@ -34,7 +40,8 @@ class IndexNotificationsJobTest { final Event event = new Event(); event.setTopic("ENRICH/MISSING/PROJECT"); - final NotificationGroup res = IndexNotificationsJob.generateNotifications(event, subscriptions, 0); + final NotificationGroup res = IndexNotificationsJob + .generateNotifications(event, subscriptions, conditionsMap, 0); assertEquals(0, res.getData().size()); } @@ -46,7 +53,8 @@ class IndexNotificationsJobTest { event.getMap().setTargetDatasourceName("reposiTUm"); event.getMap().setTrust(0.8f); - final NotificationGroup res = IndexNotificationsJob.generateNotifications(event, subscriptions, 0); + final NotificationGroup res = IndexNotificationsJob + .generateNotifications(event, subscriptions, conditionsMap, 0); assertEquals(1, res.getData().size()); } @@ -58,7 +66,8 @@ class IndexNotificationsJobTest { event.getMap().setTargetDatasourceName("Puma"); event.getMap().setTrust(0.8f); - final NotificationGroup res = IndexNotificationsJob.generateNotifications(event, subscriptions, 0); + final NotificationGroup res = IndexNotificationsJob + .generateNotifications(event, subscriptions, conditionsMap, 0); assertEquals(0, res.getData().size()); } @@ -68,14 +77,15 @@ class IndexNotificationsJobTest { event.setTopic("ENRICH/MISSING/PROJECT"); // warm up - IndexNotificationsJob.generateNotifications(event, subscriptions, 0); + IndexNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0); final long start = System.currentTimeMillis(); for (int i = 0; i < N_TIMES; i++) { - IndexNotificationsJob.generateNotifications(event, subscriptions, 0); + IndexNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0); } final long end = System.currentTimeMillis(); - System.out.println(String.format("no topic - repeated %s times - execution time: %s ms ", N_TIMES, end - start)); + System.out + .println(String.format("no topic - repeated %s times - execution time: %s ms ", N_TIMES, end - start)); } @@ -88,14 +98,15 @@ class IndexNotificationsJobTest { event.getMap().setTrust(0.8f); // warm up - IndexNotificationsJob.generateNotifications(event, subscriptions, 0); + IndexNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0); final long start = System.currentTimeMillis(); for (int i = 0; i < N_TIMES; i++) { - IndexNotificationsJob.generateNotifications(event, subscriptions, 0); + IndexNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0); } final long end = System.currentTimeMillis(); - System.out.println(String.format("topic match - repeated %s times - execution time: %s ms ", N_TIMES, end - start)); + System.out + .println(String.format("topic match - repeated %s times - execution time: %s ms ", N_TIMES, end - start)); } @Test @@ -107,14 +118,16 @@ class 
IndexNotificationsJobTest { event.getMap().setTrust(0.8f); // warm up - IndexNotificationsJob.generateNotifications(event, subscriptions, 0); + IndexNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0); final long start = System.currentTimeMillis(); for (int i = 0; i < N_TIMES; i++) { - IndexNotificationsJob.generateNotifications(event, subscriptions, 0); + IndexNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0); } final long end = System.currentTimeMillis(); - System.out.println(String.format("topic no match - repeated %s times - execution time: %s ms ", N_TIMES, end - start)); + System.out + .println( + String.format("topic no match - repeated %s times - execution time: %s ms ", N_TIMES, end - start)); } } From 69008e20c2e1c4e6af8f45c8ad4acd17bdf54fe7 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Tue, 5 Oct 2021 11:58:20 +0200 Subject: [PATCH 08/51] log and tests --- .../dhp/broker/oa/IndexNotificationsJob.java | 2 + .../oa/samples/SimpleVariableJobTest.java | 132 ++++++++++++++++++ 2 files changed, 134 insertions(+) create mode 100644 dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/samples/SimpleVariableJobTest.java diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java index 5dc3315c4..fb2e26ba2 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java @@ -92,6 +92,8 @@ public class IndexNotificationsJob { if (subscriptions.size() > 0) { final Map>> conditionsMap = prepareConditionsMap(subscriptions); + log.info("ConditionsMap: " + new ObjectMapper().writeValueAsString(conditionsMap)); + final Encoder ngEncoder = Encoders.bean(NotificationGroup.class); final Encoder nEncoder = Encoders.bean(Notification.class); final Dataset notifications = ClusterUtils diff --git a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/samples/SimpleVariableJobTest.java b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/samples/SimpleVariableJobTest.java new file mode 100644 index 000000000..a6d1c89d3 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/samples/SimpleVariableJobTest.java @@ -0,0 +1,132 @@ + +package eu.dnetlib.dhp.broker.oa.samples; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import org.apache.commons.io.FileUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.core.JsonParseException; +import com.fasterxml.jackson.databind.JsonMappingException; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.broker.model.ConditionParams; +import eu.dnetlib.dhp.broker.model.MapCondition; +import eu.dnetlib.dhp.broker.oa.util.SubscriptionUtils; + +@Disabled 
+public class SimpleVariableJobTest { + + private static final Logger log = LoggerFactory.getLogger(SimpleVariableJobTest.class); + + private static Path workingDir; + + private static SparkSession spark; + + private final static List inputList = new ArrayList<>(); + + private static final Map>> staticMap = new HashMap<>(); + + @BeforeAll + public static void beforeAll() throws IOException { + + workingDir = Files.createTempDirectory(SimpleVariableJobTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); + + final SparkConf conf = new SparkConf(); + conf.setAppName(SimpleVariableJobTest.class.getSimpleName()); + + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + // conf.set("spark.sql.warehouse.dir", workingDir.toString()); + // conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + + spark = SparkSession + .builder() + .appName(SimpleVariableJobTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + + for (int i = 0; i < 1_000_000; i++) { + inputList.add("record " + i); + } + } + + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } + + @Test + public void testSimpleVariableJob() throws Exception { + final Map>> map = fillMap(); + + final long n = spark + .createDataset(inputList, Encoders.STRING()) + .filter(s -> filter(map.get(s))) + .map((MapFunction) s -> s.toLowerCase(), Encoders.STRING()) + .count(); + + System.out.println(n); + } + + @Test + public void testSimpleVariableJob_static() throws Exception { + + staticMap.putAll(fillMap()); + + final long n = spark + .createDataset(inputList, Encoders.STRING()) + .filter(s -> filter(staticMap.get(s))) + .map((MapFunction) s -> s.toLowerCase(), Encoders.STRING()) + .count(); + + System.out.println(n); + } + + private static Map>> fillMap() + throws JsonParseException, JsonMappingException, IOException { + final String s = "[{\"field\":\"targetDatasourceName\",\"fieldType\":\"STRING\",\"operator\":\"EXACT\",\"listParams\":[{\"value\":\"reposiTUm\"}]},{\"field\":\"trust\",\"fieldType\":\"FLOAT\",\"operator\":\"RANGE\",\"listParams\":[{\"value\":\"0\",\"otherValue\":\"1\"}]}]"; + + final ObjectMapper mapper = new ObjectMapper(); + final List list = mapper + .readValue(s, mapper.getTypeFactory().constructCollectionType(List.class, MapCondition.class)); + final Map> conditions = list + .stream() + .filter(mc -> !mc.getListParams().isEmpty()) + .collect(Collectors.toMap(MapCondition::getField, MapCondition::getListParams)); + + final Map>> map = new HashMap<>(); + inputList.forEach(i -> map.put(i, conditions)); + return map; + } + + private static boolean filter(final Map> conditions) { + if (conditions.containsKey("targetDatasourceName") + && !SubscriptionUtils + .verifyExact("reposiTUm", conditions.get("targetDatasourceName").get(0).getValue())) { + return false; + } + return true; + } + +} From 210d6c0e6d002d76ada343ad4f526cc60e70b41d Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Tue, 5 Oct 2021 13:57:46 +0200 Subject: [PATCH 09/51] generateNotificationsJob and indexNotificationsJob --- .../broker/oa/GenerateNotificationsJob.java | 184 ++++++++++++++++++ .../dhp/broker/oa/IndexNotificationsJob.java | 168 +++------------- .../oa/generate_all/oozie_app/workflow.xml | 24 +++ .../dhp/broker/oa/generate_notifications.json | 14 ++ .../notifications_only/oozie_app/workflow.xml | 27 ++- 5 
files changed, 279 insertions(+), 138 deletions(-) create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateNotificationsJob.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_notifications.json diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateNotificationsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateNotificationsJob.java new file mode 100644 index 000000000..21c6c64a6 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateNotificationsJob.java @@ -0,0 +1,184 @@ + +package eu.dnetlib.dhp.broker.oa; + +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.util.LongAccumulator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.model.ConditionParams; +import eu.dnetlib.dhp.broker.model.Event; +import eu.dnetlib.dhp.broker.model.MappedFields; +import eu.dnetlib.dhp.broker.model.Notification; +import eu.dnetlib.dhp.broker.model.Subscription; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.NotificationGroup; +import eu.dnetlib.dhp.broker.oa.util.SubscriptionUtils; + +public class GenerateNotificationsJob { + + private static final Logger log = LoggerFactory.getLogger(GenerateNotificationsJob.class); + + public static void main(final String[] args) throws Exception { + + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString(GenerateNotificationsJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_notifications.json"))); + parser.parseArgument(args); + + final SparkConf conf = new SparkConf(); + + final String eventsPath = parser.get("outputDir") + "/events"; + log.info("eventsPath: {}", eventsPath); + + final String notificationsPath = parser.get("outputDir") + "/notifications"; + log.info("notificationsPath: {}", notificationsPath); + + final String brokerApiBaseUrl = parser.get("brokerApiBaseUrl"); + log.info("brokerApiBaseUrl: {}", brokerApiBaseUrl); + + final SparkSession spark = SparkSession.builder().config(conf).getOrCreate(); + + final LongAccumulator total = spark.sparkContext().longAccumulator("total_notifications"); + + final long startTime = new Date().getTime(); + + final List subscriptions = listSubscriptions(brokerApiBaseUrl); + + log.info("Number of subscriptions: " + subscriptions.size()); + + if (subscriptions.size() > 0) { + final Map>> conditionsMap = prepareConditionsMap(subscriptions); + + log.info("ConditionsMap: " + new 
ObjectMapper().writeValueAsString(conditionsMap)); + + final Encoder ngEncoder = Encoders.bean(NotificationGroup.class); + final Encoder nEncoder = Encoders.bean(Notification.class); + final Dataset notifications = ClusterUtils + .readPath(spark, eventsPath, Event.class) + .map((MapFunction) e -> generateNotifications(e, subscriptions, conditionsMap, startTime), ngEncoder) + .flatMap((FlatMapFunction) g -> g.getData().iterator(), nEncoder); + + ClusterUtils.save(notifications, notificationsPath, Notification.class, total); + } + } + + protected static Map>> prepareConditionsMap( + final List subscriptions) { + final Map>> map = new HashMap<>(); + subscriptions.forEach(s -> map.put(s.getSubscriptionId(), s.conditionsAsMap())); + return map; + } + + protected static NotificationGroup generateNotifications(final Event e, + final List subscriptions, + final Map>> conditionsMap, + final long date) { + final List list = subscriptions + .stream() + .filter(s -> StringUtils.isBlank(s.getTopic()) || s.getTopic().equals("*") || s.getTopic().equals(e.getTopic())) + .filter(s -> verifyConditions(e.getMap(), conditionsMap.get(s.getSubscriptionId()))) + .map(s -> generateNotification(s, e, date)) + .collect(Collectors.toList()); + + return new NotificationGroup(list); + } + + private static Notification generateNotification(final Subscription s, final Event e, final long date) { + final Notification n = new Notification(); + n.setNotificationId("ntf-" + DigestUtils.md5Hex(s.getSubscriptionId() + "@@@" + e.getEventId())); + n.setSubscriptionId(s.getSubscriptionId()); + n.setEventId(e.getEventId()); + n.setProducerId(e.getProducerId()); + n.setTopic(e.getTopic()); + n.setPayload(e.getPayload()); + n.setMap(e.getMap()); + n.setDate(date); + return n; + } + + private static boolean verifyConditions(final MappedFields map, + final Map> conditions) { + if (conditions.containsKey("targetDatasourceName") + && !SubscriptionUtils + .verifyExact(map.getTargetDatasourceName(), conditions.get("targetDatasourceName").get(0).getValue())) { + return false; + } + + if (conditions.containsKey("trust") + && !SubscriptionUtils + .verifyFloatRange(map.getTrust(), conditions.get("trust").get(0).getValue(), conditions.get("trust").get(0).getOtherValue())) { + return false; + } + + if (conditions.containsKey("targetDateofacceptance") && !conditions + .get("targetDateofacceptance") + .stream() + .anyMatch(c -> SubscriptionUtils + .verifyDateRange(map.getTargetDateofacceptance(), c.getValue(), c.getOtherValue()))) { + return false; + } + + if (conditions.containsKey("targetResultTitle") + && !conditions + .get("targetResultTitle") + .stream() + .anyMatch(c -> SubscriptionUtils.verifySimilar(map.getTargetResultTitle(), c.getValue()))) { + return false; + } + + if (conditions.containsKey("targetAuthors") + && !conditions + .get("targetAuthors") + .stream() + .allMatch(c -> SubscriptionUtils.verifyListSimilar(map.getTargetAuthors(), c.getValue()))) { + return false; + } + + return !conditions.containsKey("targetSubjects") + || conditions + .get("targetSubjects") + .stream() + .allMatch(c -> SubscriptionUtils.verifyListExact(map.getTargetSubjects(), c.getValue())); + + } + + private static List listSubscriptions(final String brokerApiBaseUrl) throws Exception { + final String url = brokerApiBaseUrl + "/api/subscriptions"; + final HttpGet req = new HttpGet(url); + + final ObjectMapper mapper = new ObjectMapper(); + + try (final CloseableHttpClient client = HttpClients.createDefault()) { + try (final CloseableHttpResponse response = 
client.execute(req)) { + final String s = IOUtils.toString(response.getEntity().getContent()); + return mapper + .readValue(s, mapper.getTypeFactory().constructCollectionType(List.class, Subscription.class)); + } + } + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java index fb2e26ba2..55e82446f 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java @@ -2,28 +2,22 @@ package eu.dnetlib.dhp.broker.oa; import java.io.IOException; -import java.util.Date; import java.util.HashMap; -import java.util.List; import java.util.Map; -import java.util.stream.Collectors; -import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpDelete; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoder; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; import org.apache.spark.util.LongAccumulator; +import org.elasticsearch.spark.rdd.api.java.JavaEsSpark; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -31,14 +25,8 @@ import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.broker.model.ConditionParams; -import eu.dnetlib.dhp.broker.model.Event; -import eu.dnetlib.dhp.broker.model.MappedFields; import eu.dnetlib.dhp.broker.model.Notification; -import eu.dnetlib.dhp.broker.model.Subscription; import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; -import eu.dnetlib.dhp.broker.oa.util.NotificationGroup; -import eu.dnetlib.dhp.broker.oa.util.SubscriptionUtils; public class IndexNotificationsJob { @@ -48,15 +36,14 @@ public class IndexNotificationsJob { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils - .toString( - IndexNotificationsJob.class - .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/index_notifications.json"))); + .toString(IndexNotificationsJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/index_notifications.json"))); parser.parseArgument(args); final SparkConf conf = new SparkConf(); - final String eventsPath = parser.get("outputDir") + "/events"; - log.info("eventsPath: {}", eventsPath); + final String notificationsPath = parser.get("outputDir") + "/notifications"; + log.info("notificationsPath: {}", notificationsPath); final String index = parser.get("index"); log.info("index: {}", index); @@ -83,131 +70,38 @@ public class IndexNotificationsJob { final LongAccumulator total = spark.sparkContext().longAccumulator("total_indexed"); - final long startTime = new Date().getTime(); + final Long date = ClusterUtils + .readPath(spark, notificationsPath, Notification.class) + .first() + .getDate(); - final List subscriptions = 
listSubscriptions(brokerApiBaseUrl); + final JavaRDD toIndexRdd = ClusterUtils + .readPath(spark, notificationsPath, Notification.class) + .map((MapFunction) n -> prepareForIndexing(n, total), Encoders.STRING()) + .javaRDD(); - log.info("Number of subscriptions: " + subscriptions.size()); + final Map esCfg = new HashMap<>(); - if (subscriptions.size() > 0) { - final Map>> conditionsMap = prepareConditionsMap(subscriptions); + esCfg.put("es.index.auto.create", "false"); + esCfg.put("es.nodes", indexHost); + esCfg.put("es.mapping.id", "notificationId"); // THE PRIMARY KEY + esCfg.put("es.batch.write.retry.count", esBatchWriteRetryCount); + esCfg.put("es.batch.write.retry.wait", esBatchWriteRetryWait); + esCfg.put("es.batch.size.entries", esBatchSizeEntries); + esCfg.put("es.nodes.wan.only", esNodesWanOnly); - log.info("ConditionsMap: " + new ObjectMapper().writeValueAsString(conditionsMap)); + log.info("*** Start indexing"); + JavaEsSpark.saveJsonToEs(toIndexRdd, index, esCfg); + log.info("*** End indexing"); - final Encoder ngEncoder = Encoders.bean(NotificationGroup.class); - final Encoder nEncoder = Encoders.bean(Notification.class); - final Dataset notifications = ClusterUtils - .readPath(spark, eventsPath, Event.class) - .map( - (MapFunction) e -> generateNotifications( - e, subscriptions, conditionsMap, startTime), - ngEncoder) - .flatMap((FlatMapFunction) g -> g.getData().iterator(), nEncoder); + log.info("*** Deleting old notifications"); + final String message = deleteOldNotifications(brokerApiBaseUrl, date - 1000); + log.info("*** Deleted notifications: {}", message); - notifications - .map((MapFunction) n -> prepareForIndexing(n, total), Encoders.STRING()) - .javaRDD() - .saveAsTextFile("/tmp/IndexNotificationsJob_test_6504"); - } - } + log.info("*** sendNotifications (emails, ...)"); + sendNotifications(brokerApiBaseUrl, date - 1000); + log.info("*** ALL done."); - protected static Map>> prepareConditionsMap( - final List subscriptions) { - final Map>> map = new HashMap<>(); - subscriptions.forEach(s -> map.put(s.getSubscriptionId(), s.conditionsAsMap())); - return map; - } - - protected static NotificationGroup generateNotifications(final Event e, - final List subscriptions, - final Map>> conditionsMap, - final long date) { - final List list = subscriptions - .stream() - .filter( - s -> StringUtils.isBlank(s.getTopic()) || s.getTopic().equals("*") || s.getTopic().equals(e.getTopic())) - .filter(s -> verifyConditions(e.getMap(), conditionsMap.get(s.getSubscriptionId()))) - .map(s -> generateNotification(s, e, date)) - .collect(Collectors.toList()); - - return new NotificationGroup(list); - } - - private static Notification generateNotification(final Subscription s, final Event e, final long date) { - final Notification n = new Notification(); - n.setNotificationId("ntf-" + DigestUtils.md5Hex(s.getSubscriptionId() + "@@@" + e.getEventId())); - n.setSubscriptionId(s.getSubscriptionId()); - n.setEventId(e.getEventId()); - n.setProducerId(e.getProducerId()); - n.setTopic(e.getTopic()); - n.setPayload(e.getPayload()); - n.setMap(e.getMap()); - n.setDate(date); - return n; - } - - private static boolean verifyConditions(final MappedFields map, - final Map> conditions) { - if (conditions.containsKey("targetDatasourceName") - && !SubscriptionUtils - .verifyExact(map.getTargetDatasourceName(), conditions.get("targetDatasourceName").get(0).getValue())) { - return false; - } - - if (conditions.containsKey("trust") - && !SubscriptionUtils - .verifyFloatRange( - map.getTrust(), 
conditions.get("trust").get(0).getValue(), - conditions.get("trust").get(0).getOtherValue())) { - return false; - } - - if (conditions.containsKey("targetDateofacceptance") && !conditions - .get("targetDateofacceptance") - .stream() - .anyMatch( - c -> SubscriptionUtils - .verifyDateRange(map.getTargetDateofacceptance(), c.getValue(), c.getOtherValue()))) { - return false; - } - - if (conditions.containsKey("targetResultTitle") - && !conditions - .get("targetResultTitle") - .stream() - .anyMatch(c -> SubscriptionUtils.verifySimilar(map.getTargetResultTitle(), c.getValue()))) { - return false; - } - - if (conditions.containsKey("targetAuthors") - && !conditions - .get("targetAuthors") - .stream() - .allMatch(c -> SubscriptionUtils.verifyListSimilar(map.getTargetAuthors(), c.getValue()))) { - return false; - } - - return !conditions.containsKey("targetSubjects") - || conditions - .get("targetSubjects") - .stream() - .allMatch(c -> SubscriptionUtils.verifyListExact(map.getTargetSubjects(), c.getValue())); - - } - - private static List listSubscriptions(final String brokerApiBaseUrl) throws Exception { - final String url = brokerApiBaseUrl + "/api/subscriptions"; - final HttpGet req = new HttpGet(url); - - final ObjectMapper mapper = new ObjectMapper(); - - try (final CloseableHttpClient client = HttpClients.createDefault()) { - try (final CloseableHttpResponse response = client.execute(req)) { - final String s = IOUtils.toString(response.getEntity().getContent()); - return mapper - .readValue(s, mapper.getTypeFactory().constructCollectionType(List.class, Subscription.class)); - } - } } private static String deleteOldNotifications(final String brokerApiBaseUrl, final long l) throws Exception { diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml index ea80c3acf..1a98a2513 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml @@ -542,6 +542,30 @@ --dbPassword${brokerDbPassword} --brokerApiBaseUrl${brokerApiBaseUrl} + + + + + + + yarn + cluster + GenerateNotificationsJob + eu.dnetlib.dhp.broker.oa.GenerateNotificationsJob + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --outputDir${outputDir} + --brokerApiBaseUrl${brokerApiBaseUrl} + diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_notifications.json b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_notifications.json new file mode 100644 index 000000000..6e12783b9 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_notifications.json @@ -0,0 +1,14 @@ +[ + { + "paramName": "o", + "paramLongName": "outputDir", + "paramDescription": "the dir that contains the events folder", + "paramRequired": true + }, + { + 
"paramName": "broker", + "paramLongName": "brokerApiBaseUrl", + "paramDescription": "the url of the broker service api", + "paramRequired": true + } +] diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/notifications_only/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/notifications_only/oozie_app/workflow.xml index 70ab7d344..d2d7b6d11 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/notifications_only/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/notifications_only/oozie_app/workflow.xml @@ -119,12 +119,36 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + yarn + cluster + GenerateNotificationsJob + eu.dnetlib.dhp.broker.oa.GenerateNotificationsJob + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --outputDir${outputDir} + --brokerApiBaseUrl${brokerApiBaseUrl} + + + + + yarn @@ -135,6 +159,7 @@ --executor-memory=${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} + --conf spark.dynamicAllocation.maxExecutors="8" --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} From d6e1f224086380de758d6be2854a13b6fe8809b4 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Tue, 5 Oct 2021 15:09:18 +0200 Subject: [PATCH 10/51] max numbers of workers for indexing --- .../dhp/broker/oa/generate_all/oozie_app/workflow.xml | 9 +++++++-- .../broker/oa/notifications_only/oozie_app/workflow.xml | 7 ++++++- .../dnetlib/dhp/broker/oa/reindex/oozie_app/workflow.xml | 7 ++++++- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml index 1a98a2513..bc6778f52 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml @@ -115,6 +115,11 @@ spark2EventLogDir spark 2.* event log dir location + + sparkMaxExecutorsForIndexing + 8 + Max number of workers for ElasticSearch indexing + @@ -498,7 +503,7 @@ --executor-memory=${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} - --conf spark.dynamicAllocation.maxExecutors="8" + --conf spark.dynamicAllocation.maxExecutors=${sparkMaxExecutorsForIndexing} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} @@ -580,7 +585,7 @@ --executor-memory=${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} - --conf spark.dynamicAllocation.maxExecutors="8" + --conf 
spark.dynamicAllocation.maxExecutors=${sparkMaxExecutorsForIndexing} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/notifications_only/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/notifications_only/oozie_app/workflow.xml index d2d7b6d11..0d226d78e 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/notifications_only/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/notifications_only/oozie_app/workflow.xml @@ -98,6 +98,11 @@ spark2EventLogDir spark 2.* event log dir location + + sparkMaxExecutorsForIndexing + 8 + Max number of workers for ElasticSearch indexing + @@ -159,7 +164,7 @@ --executor-memory=${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} - --conf spark.dynamicAllocation.maxExecutors="8" + --conf spark.dynamicAllocation.maxExecutors=${sparkMaxExecutorsForIndexing} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/reindex/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/reindex/oozie_app/workflow.xml index 9095004ad..87adfffaa 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/reindex/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/reindex/oozie_app/workflow.xml @@ -75,6 +75,11 @@ spark2EventLogDir spark 2.* event log dir location + + sparkMaxExecutorsForIndexing + 8 + Max number of workers for ElasticSearch indexing + @@ -112,7 +117,7 @@ --executor-memory=${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} - --conf spark.dynamicAllocation.maxExecutors="8" + --conf spark.dynamicAllocation.maxExecutors=${sparkMaxExecutorsForIndexing} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} From 2f61054cd198ce563fba2149b23c53aafbaf7029 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 11 Oct 2021 18:29:42 +0200 Subject: [PATCH 11/51] code formatting --- .../main/java/eu/dnetlib/dhp/PropagationConstant.java | 11 ++++++++--- .../SparkOrcidToResultFromSemRelJob.java | 7 +++++-- .../SparkResultToCommunityFromOrganizationJob.java | 4 ++-- .../SparkResultToCommunityThroughSemRelJob.java | 4 ++-- .../dhp/oa/provision/IndexRecordTransformerTest.java | 11 ++++++----- 5 files changed, 23 insertions(+), 14 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java index 0d7c74475..23e97a97a 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java @@ -69,7 +69,7 @@ public class PropagationConstant { PROPAGATION_DATA_INFO_TYPE, PROPAGATION_COUNTRY_INSTREPO_CLASS_ID, 
PROPAGATION_COUNTRY_INSTREPO_CLASS_NAME, - ModelConstants.DNET_PROVENANCE_ACTIONS)); + ModelConstants.DNET_PROVENANCE_ACTIONS)); return nc; } @@ -84,7 +84,8 @@ public class PropagationConstant { return di; } - public static Qualifier getQualifier(String inference_class_id, String inference_class_name, String qualifierSchema) { + public static Qualifier getQualifier(String inference_class_id, String inference_class_name, + String qualifierSchema) { Qualifier pa = new Qualifier(); pa.setClassid(inference_class_id); pa.setClassname(inference_class_name); @@ -108,7 +109,11 @@ public class PropagationConstant { r.setRelClass(rel_class); r.setRelType(rel_type); r.setSubRelType(subrel_type); - r.setDataInfo(getDataInfo(inference_provenance, inference_class_id, inference_class_name, ModelConstants.DNET_PROVENANCE_ACTIONS)); + r + .setDataInfo( + getDataInfo( + inference_provenance, inference_class_id, inference_class_name, + ModelConstants.DNET_PROVENANCE_ACTIONS)); return r; } diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java index 68949b900..a38b4da2e 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java @@ -173,14 +173,17 @@ public class SparkOrcidToResultFromSemRelJob { if (toaddpid) { StructuredProperty p = new StructuredProperty(); p.setValue(autoritative_author.getOrcid()); - p.setQualifier(getQualifier(ModelConstants.ORCID_PENDING, ModelConstants.ORCID_CLASSNAME, ModelConstants.DNET_PID_TYPES)); + p + .setQualifier( + getQualifier( + ModelConstants.ORCID_PENDING, ModelConstants.ORCID_CLASSNAME, ModelConstants.DNET_PID_TYPES)); p .setDataInfo( getDataInfo( PROPAGATION_DATA_INFO_TYPE, PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_ID, PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_NAME, - ModelConstants.DNET_PROVENANCE_ACTIONS)); + ModelConstants.DNET_PROVENANCE_ACTIONS)); Optional> authorPid = Optional.ofNullable(author.getPid()); if (authorPid.isPresent()) { diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java index 1289ff644..50df08f8c 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java @@ -10,7 +10,6 @@ import java.util.List; import java.util.Optional; import java.util.stream.Collectors; -import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -22,6 +21,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Context; import eu.dnetlib.dhp.schema.oaf.Result; import scala.Tuple2; @@ -130,7 +130,7 @@ public class 
SparkResultToCommunityFromOrganizationJob { PROPAGATION_DATA_INFO_TYPE, PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_ID, PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_NAME, - ModelConstants.DNET_PROVENANCE_ACTIONS))); + ModelConstants.DNET_PROVENANCE_ACTIONS))); propagatedContexts.add(newContext); } } diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/SparkResultToCommunityThroughSemRelJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/SparkResultToCommunityThroughSemRelJob.java index 7f76ead94..f31a26230 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/SparkResultToCommunityThroughSemRelJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/SparkResultToCommunityThroughSemRelJob.java @@ -7,7 +7,6 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import java.util.*; import java.util.stream.Collectors; -import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -20,6 +19,7 @@ import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; import scala.Tuple2; @@ -126,7 +126,7 @@ public class SparkResultToCommunityThroughSemRelJob { PROPAGATION_DATA_INFO_TYPE, PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID, PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME, - ModelConstants.DNET_PROVENANCE_ACTIONS))); + ModelConstants.DNET_PROVENANCE_ACTIONS))); return newContext; } return null; diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java index 1c7dce3f2..64935e79d 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java @@ -84,13 +84,15 @@ public class IndexRecordTransformerTest { @Test public void testForEOSCFutureTraining() throws IOException, TransformerException { - final String record = IOUtils.toString(getClass().getResourceAsStream("eosc-future/training-notebooks-seadatanet.xml")); + final String record = IOUtils + .toString(getClass().getResourceAsStream("eosc-future/training-notebooks-seadatanet.xml")); testRecordTransformation(record); } @Test public void testForEOSCFutureAirQualityCopernicus() throws IOException, TransformerException { - final String record = IOUtils.toString(getClass().getResourceAsStream("eosc-future/air-quality-copernicus.xml")); + final String record = IOUtils + .toString(getClass().getResourceAsStream("eosc-future/air-quality-copernicus.xml")); testRecordTransformation(record); } @@ -102,12 +104,11 @@ public class IndexRecordTransformerTest { @Test public void testForEOSCFutureB2SharePlotRelatedORP() throws IOException, TransformerException { - final String record = IOUtils.toString(getClass().getResourceAsStream("eosc-future/b2share-plot-related-orp.xml")); + final String record = IOUtils + .toString(getClass().getResourceAsStream("eosc-future/b2share-plot-related-orp.xml")); 
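As background for these layout tests, here is a minimal, self-contained Java sketch of applying an XSLT stylesheet to a record, in the spirit of the shared helper below; the class name and wiring are illustrative assumptions, not the project's actual indexing code.

import java.io.StringReader;
import java.io.StringWriter;

import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;

public class XsltTransformSketch {

	// Applies the given XSLT stylesheet to the given XML record and returns the result.
	public static String transform(final String xslt, final String record) throws Exception {
		final Transformer t = TransformerFactory
			.newInstance()
			.newTransformer(new StreamSource(new StringReader(xslt)));
		final StringWriter out = new StringWriter();
		t.transform(new StreamSource(new StringReader(record)), new StreamResult(out));
		return out.toString();
	}
}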
testRecordTransformation(record); } - - private void testRecordTransformation(final String record) throws IOException, TransformerException { final String fields = IOUtils.toString(getClass().getResourceAsStream("fields.xml")); final String xslt = IOUtils.toString(getClass().getResourceAsStream("layoutToRecordTransformer.xsl")); From 8db39c86e2ad41f0feefa21e94d8700690884608 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 13 Oct 2021 11:34:24 +0200 Subject: [PATCH 12/51] added new parameter in the doiboost process workflow to specify a folder for processing the MAG dataset --- .../dnetlib/dhp/doiboost/process/oozie_app/workflow.xml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/process/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/process/oozie_app/workflow.xml index eb82c3a7d..282c06f4e 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/process/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/process/oozie_app/workflow.xml @@ -48,7 +48,11 @@ inputPathMAG - the MAG working path + the MAG input path + + + workingPathMAG + the MAG working path where to store the intermediate process results @@ -138,7 +142,7 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --sourcePath${inputPathMAG}/dataset - --workingPath${inputPathMAG}/process_p + --workingPath${workingPathMAG} --targetPath${workingPath} --masteryarn-cluster From 4b1920f008955e429ac20037fa66cb7676633b8f Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 13 Oct 2021 15:33:50 +0200 Subject: [PATCH 13/51] changed the working path parameter value to depend on the dnet-workflow working dir parameter --- .../eu/dnetlib/dhp/doiboost/process/oozie_app/workflow.xml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/process/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/process/oozie_app/workflow.xml index 282c06f4e..29a12f4df 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/process/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/process/oozie_app/workflow.xml @@ -50,10 +50,6 @@ inputPathMAG the MAG input path - - workingPathMAG - the MAG working path where to store the intermediate process results - @@ -142,7 +138,7 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --sourcePath${inputPathMAG}/dataset - --workingPath${workingPathMAG} + --workingPath${workingPath}/MAG --targetPath${workingPath} --masteryarn-cluster From e15a1969a57726228f487e8b06b467ad5cdbef33 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 14 Oct 2021 14:33:56 +0200 Subject: [PATCH 14/51] applying a fix to the DOIBoost construction process that somehow wasn't part of the merge done in 83c90c71809c947adcfc1e70eb9d9fad195d87aa --- .../eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala index e501b4823..c08e09f53 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala +++ 
b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala @@ -206,9 +206,9 @@ object SparkGenerateDoiBoost { (r.getSource, r) else if (r.getTarget.startsWith("unresolved")) (r.getTarget,r) - else + else ("resolved", r) - }) + })(Encoders.tuple(Encoders.STRING, mapEncoderRel)) val openaireOrganization:Dataset[(String,String)] = spark.read.text(openaireOrganizationPath).as[String].flatMap(s => extractIdGRID(s)).groupByKey(_._2).reduceGroups((x,y) => if (x != null) x else y ).map(_._2) @@ -221,8 +221,8 @@ object SparkGenerateDoiBoost { currentRels.setSource(currentOrgs._1) else currentRels.setTarget(currentOrgs._1) - currentRels - }.write.save(s"$workingDirPath/doiBoostPublicationAffiliation") + currentRels + }.filter(r=> !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved")).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation") magPubs.joinWith(a,magPubs("_1").equalTo(a("PaperId"))).map( item => { val affiliation = item._2 @@ -242,6 +242,6 @@ object SparkGenerateDoiBoost { else null }).filter(o=> o!=null).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostOrganization") - } + } -} +} \ No newline at end of file From e471f12d5e3eebfddc7ece9b804ef998112bd48f Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 19 Oct 2021 12:35:38 +0200 Subject: [PATCH 15/51] hotfix: recovered implementation removing the hardcoded working_dirs --- .../CreateRelatedEntitiesJob_phase1.java | 39 +++++++++---------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java index a33a45517..48e5945c0 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java @@ -52,8 +52,11 @@ public class CreateRelatedEntitiesJob_phase1 { final String jsonConfiguration = IOUtils .toString( - PrepareRelationsJob.class - .getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase1.json")); + Objects + .requireNonNull( + CreateRelatedEntitiesJob_phase1.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase1.json"))); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); parser.parseArgument(args); @@ -75,6 +78,7 @@ public class CreateRelatedEntitiesJob_phase1 { final String graphTableClassName = parser.get("graphTableClassName"); log.info("graphTableClassName: {}", graphTableClassName); + @SuppressWarnings("unchecked") final Class entityClazz = (Class) Class.forName(graphTableClassName); final SparkConf conf = new SparkConf(); @@ -101,22 +105,12 @@ public class CreateRelatedEntitiesJob_phase1 { Encoders.tuple(Encoders.STRING(), Encoders.kryo(Relation.class))) .cache(); - readPathEntity(spark, inputEntityPath, clazz) + final Dataset> entities = readPathEntity(spark, inputEntityPath, clazz) .filter("dataInfo.invisible == false") .map( (MapFunction>) e -> new Tuple2<>(e.getId(), asRelatedEntity(e, clazz)), - Encoders - .tuple(Encoders.STRING(), Encoders.kryo(RelatedEntity.class))) - .write() - .mode(SaveMode.Overwrite) - .save("/tmp/beta_provision/working_dir/update_solr/join_partial/relatedEntities/" + 
clazz.getSimpleName()); - - final Dataset> entities = spark - .read() - .load("/tmp/beta_provision/working_dir/update_solr/join_partial/relatedEntities/" + clazz.getSimpleName()) - .as( - Encoders - .tuple(Encoders.STRING(), Encoders.kryo(RelatedEntity.class))); + Encoders.tuple(Encoders.STRING(), Encoders.kryo(RelatedEntity.class))) + .cache(); relsByTarget .joinWith(entities, entities.col("_1").equalTo(relsByTarget.col("_1")), "inner") @@ -149,8 +143,10 @@ public class CreateRelatedEntitiesJob_phase1 { re.setId(entity.getId()); re.setType(EntityType.fromClass(clazz).name()); - if (entity.getPid() != null) + // TODO move the max number of PIDs to eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits + if (Objects.nonNull(entity.getPid())) { re.setPid(entity.getPid().stream().limit(400).collect(Collectors.toList())); + } re.setCollectedfrom(entity.getCollectedfrom()); switch (EntityType.fromClass(clazz)) { @@ -212,7 +208,7 @@ public class CreateRelatedEntitiesJob_phase1 { final List> f = p.getFundingtree(); if (!f.isEmpty()) { - re.setFundingtree(f.stream().map(s -> s.getValue()).collect(Collectors.toList())); + re.setFundingtree(f.stream().map(Field::getValue).collect(Collectors.toList())); } break; } @@ -227,15 +223,16 @@ public class CreateRelatedEntitiesJob_phase1 { return Optional .ofNullable(f) .filter(Objects::nonNull) - .map(x -> x.getValue()) + .map(Field::getValue) .orElse(defaultValue); } /** - * Reads a Dataset of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text file, + * Reads a Dataset of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text + * file * - * @param spark - * @param relationPath + * @param spark the SparkSession + * @param relationPath the path storing the relation objects * @return the Dataset containing all the relationships */ private static Dataset readPathRelation( From bdffa86c2f5cb8571520c5bfd79c35e56a6950e0 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 19 Oct 2021 15:39:38 +0200 Subject: [PATCH 16/51] undo last commit --- .../dhp/broker/oa/IndexNotificationsJob.java | 75 +++++++++---------- .../notifications_only/oozie_app/workflow.xml | 1 - 2 files changed, 34 insertions(+), 42 deletions(-) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java index e8ef5dd3e..fb2e26ba2 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java @@ -17,7 +17,6 @@ import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; @@ -25,7 +24,6 @@ import org.apache.spark.sql.Encoder; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; import org.apache.spark.util.LongAccumulator; -import org.elasticsearch.spark.rdd.api.java.JavaEsSpark; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -33,7 +31,11 @@ import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import 
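As context for the hotfix above, which replaces a temporary save/load round-trip on HDFS with an in-memory cache before the join, here is a minimal, hypothetical Spark sketch of the same pattern; the paths, schema and column names are invented for illustration.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class CachedJoinSketch {

	public static void main(final String[] args) {
		final SparkSession spark = SparkSession
			.builder()
			.appName("cached-join-sketch")
			.master("local[*]")
			.getOrCreate();

		// cache() keeps the filtered entities in memory for reuse across actions,
		// instead of materialising them in a hardcoded working dir and reading them back
		final Dataset<Row> entities = spark
			.read()
			.json("/tmp/entities") // hypothetical input path
			.filter("dataInfo.invisible = false")
			.cache();

		final Dataset<Row> rels = spark.read().json("/tmp/relations"); // hypothetical input path

		rels
			.join(entities, rels.col("target").equalTo(entities.col("id")), "inner")
			.write()
			.mode("overwrite")
			.parquet("/tmp/joined"); // hypothetical output path

		spark.stop();
	}
}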
eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.broker.model.*; +import eu.dnetlib.dhp.broker.model.ConditionParams; +import eu.dnetlib.dhp.broker.model.Event; +import eu.dnetlib.dhp.broker.model.MappedFields; +import eu.dnetlib.dhp.broker.model.Notification; +import eu.dnetlib.dhp.broker.model.Subscription; import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; import eu.dnetlib.dhp.broker.oa.util.NotificationGroup; import eu.dnetlib.dhp.broker.oa.util.SubscriptionUtils; @@ -85,55 +87,46 @@ public class IndexNotificationsJob { final List subscriptions = listSubscriptions(brokerApiBaseUrl); - log.info("Number of subscriptions: {}", subscriptions.size()); + log.info("Number of subscriptions: " + subscriptions.size()); + + if (subscriptions.size() > 0) { + final Map>> conditionsMap = prepareConditionsMap(subscriptions); + + log.info("ConditionsMap: " + new ObjectMapper().writeValueAsString(conditionsMap)); - if (!subscriptions.isEmpty()) { final Encoder ngEncoder = Encoders.bean(NotificationGroup.class); final Encoder nEncoder = Encoders.bean(Notification.class); final Dataset notifications = ClusterUtils .readPath(spark, eventsPath, Event.class) .map( - (MapFunction) e -> generateNotifications(e, subscriptions, startTime), + (MapFunction) e -> generateNotifications( + e, subscriptions, conditionsMap, startTime), ngEncoder) .flatMap((FlatMapFunction) g -> g.getData().iterator(), nEncoder); - final JavaRDD inputRdd = notifications + notifications .map((MapFunction) n -> prepareForIndexing(n, total), Encoders.STRING()) - .javaRDD(); - - final Map esCfg = new HashMap<>(); - - esCfg.put("es.index.auto.create", "false"); - esCfg.put("es.nodes", indexHost); - esCfg.put("es.mapping.id", "notificationId"); // THE PRIMARY KEY - esCfg.put("es.batch.write.retry.count", esBatchWriteRetryCount); - esCfg.put("es.batch.write.retry.wait", esBatchWriteRetryWait); - esCfg.put("es.batch.size.entries", esBatchSizeEntries); - esCfg.put("es.nodes.wan.only", esNodesWanOnly); - - log.info("*** Start indexing"); - JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg); - log.info("*** End indexing"); - - log.info("*** Deleting old notifications"); - final String message = deleteOldNotifications(brokerApiBaseUrl, startTime - 1000); - log.info("*** Deleted notifications: {}", message); - - log.info("*** sendNotifications (emails, ...)"); - sendNotifications(brokerApiBaseUrl, startTime - 1000); - log.info("*** ALL done."); - + .javaRDD() + .saveAsTextFile("/tmp/IndexNotificationsJob_test_6504"); } } - private static NotificationGroup generateNotifications(final Event e, + protected static Map>> prepareConditionsMap( + final List subscriptions) { + final Map>> map = new HashMap<>(); + subscriptions.forEach(s -> map.put(s.getSubscriptionId(), s.conditionsAsMap())); + return map; + } + + protected static NotificationGroup generateNotifications(final Event e, final List subscriptions, + final Map>> conditionsMap, final long date) { final List list = subscriptions .stream() .filter( s -> StringUtils.isBlank(s.getTopic()) || s.getTopic().equals("*") || s.getTopic().equals(e.getTopic())) - .filter(s -> verifyConditions(e.getMap(), s.conditionsAsMap())) + .filter(s -> verifyConditions(e.getMap(), conditionsMap.get(s.getSubscriptionId()))) .map(s -> generateNotification(s, e, date)) .collect(Collectors.toList()); @@ -169,28 +162,28 @@ public class IndexNotificationsJob { return false; } - if (conditions.containsKey("targetDateofacceptance") && conditions + if (conditions.containsKey("targetDateofacceptance") 
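For context on the Elasticsearch settings being restored in this revert, here is a minimal, hypothetical sketch of how an elasticsearch-hadoop configuration map drives JavaEsSpark; the host, index name and batch value are placeholders, while the property keys are standard es-hadoop settings.

import java.util.HashMap;
import java.util.Map;

import org.apache.spark.api.java.JavaRDD;
import org.elasticsearch.spark.rdd.api.java.JavaEsSpark;

public class EsIndexingSketch {

	public static void indexJson(final JavaRDD<String> docs) {
		final Map<String, String> esCfg = new HashMap<>();
		esCfg.put("es.nodes", "localhost"); // placeholder ES host
		esCfg.put("es.index.auto.create", "false"); // fail fast if the index does not exist
		esCfg.put("es.mapping.id", "notificationId"); // JSON field used as the document _id
		esCfg.put("es.batch.size.entries", "200"); // flush a bulk request every N documents
		esCfg.put("es.nodes.wan.only", "true"); // talk only to the configured nodes

		JavaEsSpark.saveJsonToEs(docs, "notifications", esCfg); // placeholder index name
	}
}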
&& !conditions .get("targetDateofacceptance") .stream() - .noneMatch( + .anyMatch( c -> SubscriptionUtils .verifyDateRange(map.getTargetDateofacceptance(), c.getValue(), c.getOtherValue()))) { return false; } if (conditions.containsKey("targetResultTitle") - && conditions + && !conditions .get("targetResultTitle") .stream() - .noneMatch(c -> SubscriptionUtils.verifySimilar(map.getTargetResultTitle(), c.getValue()))) { + .anyMatch(c -> SubscriptionUtils.verifySimilar(map.getTargetResultTitle(), c.getValue()))) { return false; } if (conditions.containsKey("targetAuthors") - && conditions + && !conditions .get("targetAuthors") .stream() - .noneMatch(c -> SubscriptionUtils.verifyListSimilar(map.getTargetAuthors(), c.getValue()))) { + .allMatch(c -> SubscriptionUtils.verifyListSimilar(map.getTargetAuthors(), c.getValue()))) { return false; } @@ -202,7 +195,7 @@ public class IndexNotificationsJob { } - private static List listSubscriptions(final String brokerApiBaseUrl) throws IOException { + private static List listSubscriptions(final String brokerApiBaseUrl) throws Exception { final String url = brokerApiBaseUrl + "/api/subscriptions"; final HttpGet req = new HttpGet(url); @@ -217,7 +210,7 @@ public class IndexNotificationsJob { } } - private static String deleteOldNotifications(final String brokerApiBaseUrl, final long l) throws IOException { + private static String deleteOldNotifications(final String brokerApiBaseUrl, final long l) throws Exception { final String url = brokerApiBaseUrl + "/api/notifications/byDate/0/" + l; final HttpDelete req = new HttpDelete(url); diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/notifications_only/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/notifications_only/oozie_app/workflow.xml index 248326d57..70ab7d344 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/notifications_only/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/notifications_only/oozie_app/workflow.xml @@ -135,7 +135,6 @@ --executor-memory=${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} - --conf spark.dynamicAllocation.maxExecutors="8" --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} From c4fce785ab623fcce1afaaecbbcefd45dbcf175f Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Tue, 19 Oct 2021 16:18:26 +0200 Subject: [PATCH 17/51] fixed a compilation problem of a unit test --- .../broker/oa/GenerateNotificationsJob.java | 22 +++++++++++++------ .../dhp/broker/oa/IndexNotificationsJob.java | 5 +++-- ...java => GenerateNotificationsJobTest.java} | 22 +++++++++---------- 3 files changed, 29 insertions(+), 20 deletions(-) rename dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/{IndexNotificationsJobTest.java => GenerateNotificationsJobTest.java} (81%) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateNotificationsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateNotificationsJob.java index 21c6c64a6..6b8d60f40 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateNotificationsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateNotificationsJob.java @@ -45,8 
+45,9 @@ public class GenerateNotificationsJob { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils - .toString(GenerateNotificationsJob.class - .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_notifications.json"))); + .toString( + GenerateNotificationsJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_notifications.json"))); parser.parseArgument(args); final SparkConf conf = new SparkConf(); @@ -79,7 +80,10 @@ public class GenerateNotificationsJob { final Encoder nEncoder = Encoders.bean(Notification.class); final Dataset notifications = ClusterUtils .readPath(spark, eventsPath, Event.class) - .map((MapFunction) e -> generateNotifications(e, subscriptions, conditionsMap, startTime), ngEncoder) + .map( + (MapFunction) e -> generateNotifications( + e, subscriptions, conditionsMap, startTime), + ngEncoder) .flatMap((FlatMapFunction) g -> g.getData().iterator(), nEncoder); ClusterUtils.save(notifications, notificationsPath, Notification.class, total); @@ -99,7 +103,8 @@ public class GenerateNotificationsJob { final long date) { final List list = subscriptions .stream() - .filter(s -> StringUtils.isBlank(s.getTopic()) || s.getTopic().equals("*") || s.getTopic().equals(e.getTopic())) + .filter( + s -> StringUtils.isBlank(s.getTopic()) || s.getTopic().equals("*") || s.getTopic().equals(e.getTopic())) .filter(s -> verifyConditions(e.getMap(), conditionsMap.get(s.getSubscriptionId()))) .map(s -> generateNotification(s, e, date)) .collect(Collectors.toList()); @@ -130,15 +135,18 @@ public class GenerateNotificationsJob { if (conditions.containsKey("trust") && !SubscriptionUtils - .verifyFloatRange(map.getTrust(), conditions.get("trust").get(0).getValue(), conditions.get("trust").get(0).getOtherValue())) { + .verifyFloatRange( + map.getTrust(), conditions.get("trust").get(0).getValue(), + conditions.get("trust").get(0).getOtherValue())) { return false; } if (conditions.containsKey("targetDateofacceptance") && !conditions .get("targetDateofacceptance") .stream() - .anyMatch(c -> SubscriptionUtils - .verifyDateRange(map.getTargetDateofacceptance(), c.getValue(), c.getOtherValue()))) { + .anyMatch( + c -> SubscriptionUtils + .verifyDateRange(map.getTargetDateofacceptance(), c.getValue(), c.getOtherValue()))) { return false; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java index 55e82446f..a2aa30092 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java @@ -36,8 +36,9 @@ public class IndexNotificationsJob { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils - .toString(IndexNotificationsJob.class - .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/index_notifications.json"))); + .toString( + IndexNotificationsJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/index_notifications.json"))); parser.parseArgument(args); final SparkConf conf = new SparkConf(); diff --git a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJobTest.java b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/GenerateNotificationsJobTest.java similarity index 81% rename from 
dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJobTest.java rename to dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/GenerateNotificationsJobTest.java index 8ecb3061b..233963e2f 100644 --- a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJobTest.java +++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/GenerateNotificationsJobTest.java @@ -16,7 +16,7 @@ import eu.dnetlib.dhp.broker.model.MappedFields; import eu.dnetlib.dhp.broker.model.Subscription; import eu.dnetlib.dhp.broker.oa.util.NotificationGroup; -class IndexNotificationsJobTest { +class GenerateNotificationsJobTest { private List subscriptions; @@ -32,7 +32,7 @@ class IndexNotificationsJobTest { .setConditions( "[{\"field\":\"targetDatasourceName\",\"fieldType\":\"STRING\",\"operator\":\"EXACT\",\"listParams\":[{\"value\":\"reposiTUm\"}]},{\"field\":\"trust\",\"fieldType\":\"FLOAT\",\"operator\":\"RANGE\",\"listParams\":[{\"value\":\"0\",\"otherValue\":\"1\"}]}]"); subscriptions = Arrays.asList(s); - conditionsMap = IndexNotificationsJob.prepareConditionsMap(subscriptions); + conditionsMap = GenerateNotificationsJob.prepareConditionsMap(subscriptions); } @Test @@ -40,7 +40,7 @@ class IndexNotificationsJobTest { final Event event = new Event(); event.setTopic("ENRICH/MISSING/PROJECT"); - final NotificationGroup res = IndexNotificationsJob + final NotificationGroup res = GenerateNotificationsJob .generateNotifications(event, subscriptions, conditionsMap, 0); assertEquals(0, res.getData().size()); } @@ -53,7 +53,7 @@ class IndexNotificationsJobTest { event.getMap().setTargetDatasourceName("reposiTUm"); event.getMap().setTrust(0.8f); - final NotificationGroup res = IndexNotificationsJob + final NotificationGroup res = GenerateNotificationsJob .generateNotifications(event, subscriptions, conditionsMap, 0); assertEquals(1, res.getData().size()); } @@ -66,7 +66,7 @@ class IndexNotificationsJobTest { event.getMap().setTargetDatasourceName("Puma"); event.getMap().setTrust(0.8f); - final NotificationGroup res = IndexNotificationsJob + final NotificationGroup res = GenerateNotificationsJob .generateNotifications(event, subscriptions, conditionsMap, 0); assertEquals(0, res.getData().size()); } @@ -77,11 +77,11 @@ class IndexNotificationsJobTest { event.setTopic("ENRICH/MISSING/PROJECT"); // warm up - IndexNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0); + GenerateNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0); final long start = System.currentTimeMillis(); for (int i = 0; i < N_TIMES; i++) { - IndexNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0); + GenerateNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0); } final long end = System.currentTimeMillis(); System.out @@ -98,11 +98,11 @@ class IndexNotificationsJobTest { event.getMap().setTrust(0.8f); // warm up - IndexNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0); + GenerateNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0); final long start = System.currentTimeMillis(); for (int i = 0; i < N_TIMES; i++) { - IndexNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0); + GenerateNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0); } final long end = System.currentTimeMillis(); System.out @@ -118,11 +118,11 @@ class 
IndexNotificationsJobTest { event.getMap().setTrust(0.8f); // warm up - IndexNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0); + GenerateNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0); final long start = System.currentTimeMillis(); for (int i = 0; i < N_TIMES; i++) { - IndexNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0); + GenerateNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0); } final long end = System.currentTimeMillis(); System.out From 43f4eb492b1dc8c747fdf8dd90d22e6d2f98dcb1 Mon Sep 17 00:00:00 2001 From: antleb Date: Thu, 14 Oct 2021 18:53:35 +0300 Subject: [PATCH 18/51] fetching affiliated results for 4 orgs in monitor; fixed affiliated orgs in the stats db --- .../oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql | 2 +- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 9ea50d488..d39af37d5 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -17,7 +17,7 @@ create table TARGET.result as union all select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id) union all - select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on p.id=rp.project join SOURCE.project_organizations po on po.id=p.id where rp.id=r.id and po.organization in ( + select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in ( 'openorgs____::759d59f05d77188faee99b7493b46805', 'openorgs____::b84450f9864182c67b8611b5593f4250', 'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql index b3cbc9b41..d461bd343 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql @@ -126,7 +126,7 @@ FROM ${stats_db_name}.otherresearchproduct_topics; CREATE TABLE ${stats_db_name}.result_organization AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r -WHERE r.reltype = 'resultOrganization' +WHERE r.reltype = 'resultOrganization' and r.relclass='hasAuthorInstitution' and r.subreltype='affiliation' and r.datainfo.deletedbyinference = false; CREATE TABLE ${stats_db_name}.result_projects AS From a7376907c2bc02b363912392cb06fcb177c53a92 Mon Sep 17 00:00:00 2001 From: antleb Date: Fri, 15 Oct 2021 13:42:55 +0300 Subject: [PATCH 19/51] invalidating metadata before creating the context tables --- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh index 6d42ab13d..2a40eb920 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh @@ -26,6 +26,8 @@ hdfs dfs -copyFromLocal concepts.csv ${TMP} hdfs dfs -chmod -R 777 ${TMP} echo "Creating and populating impala tables" +impala-shell -q "invalidate metadata" +impala-shell -d ${TARGET_DB} -q "invalidate metadata" impala-shell -q "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','" impala-shell -q "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ','" impala-shell -q "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ','" From cece432adc0b4c929e768a94359d024f7e59fbef Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 20 Oct 2021 14:14:53 +0200 Subject: [PATCH 20/51] [stats] reducing the step22 wait time --- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateCache.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateCache.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateCache.sh index dc19f84b4..03aa535e1 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateCache.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateCache.sh @@ -1,4 +1,4 @@ #!/usr/bin/env bash curl --request GET $1/cache/updateCache -sleep 20h \ No newline at end of file +sleep 6h \ No newline at end of file From b97b78f874ee744e87f1bc4d157cbeb6dfb569a9 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Mon, 1 Nov 2021 15:42:29 +0200 Subject: [PATCH 21/51] removed hardcoded reference --- .../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql index b977302df..e892da0be 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql @@ -65,7 +65,7 @@ FROM ${stats_db_name}.project_tmp p UPDATE ${stats_db_name}.publication_tmp SET delayed = 'yes' WHERE publication_tmp.id IN (SELECT distinct r.id - FROM stats_wf_db_obs.result r, + FROM ${stats_db_name}.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id = pr.result @@ -75,7 +75,7 @@ WHERE publication_tmp.id IN (SELECT distinct r.id UPDATE ${stats_db_name}.dataset_tmp SET delayed = 'yes' WHERE dataset_tmp.id IN (SELECT distinct r.id - FROM stats_wf_db_obs.result r, + FROM ${stats_db_name}.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id = pr.result From b9d4f115ccf8d6bb3d2b033a3ae351be2f575932 Mon Sep 17 00:00:00 2001 
From: Alessia Bardi Date: Tue, 9 Nov 2021 12:04:45 +0100 Subject: [PATCH 22/51] fixed Crossref mapping for SFI projects --- .../main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index 25f0ff381..91d4bd4d3 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -358,7 +358,7 @@ case object Crossref2Oaf { case "10.13039/100000001" => generateSimpleRelationFromAward(funder, "nsf_________", a => a) case "10.13039/501100001665" => generateSimpleRelationFromAward(funder, "anr_________", a => a) case "10.13039/501100002341" => generateSimpleRelationFromAward(funder, "aka_________", a => a) - case "10.13039/501100001602" => generateSimpleRelationFromAward(funder, "aka_________", a => a.replace("SFI", "")) + case "10.13039/501100001602" => generateSimpleRelationFromAward(funder, "sfi_________", a => a.replace("SFI", "")) case "10.13039/501100000923" => generateSimpleRelationFromAward(funder, "arc_________", a => a) case "10.13039/501100000038"=> val targetId = getProjectId("nserc_______" , "1e5e62235d094afd01cd56e65112fc63") queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) From 6cd91004e33c0eda2fa6f1381a1a65cd21f8e40f Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Tue, 9 Nov 2021 12:22:57 +0100 Subject: [PATCH 23/51] fixed DOI for Wellcome Trust in mapping relationships from Crossref --- .../main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index 91d4bd4d3..1404469ef 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -382,8 +382,8 @@ case object Crossref2Oaf { case "10.13039/501100006769"=> generateSimpleRelationFromAward(funder, "rsf_________", a=>a) case "10.13039/501100001711"=> generateSimpleRelationFromAward(funder, "snsf________", snsfRule) case "10.13039/501100004410"=> generateSimpleRelationFromAward(funder, "tubitakf____", a =>a) - case "10.10.13039/100004440"=> generateSimpleRelationFromAward(funder, "wt__________", a =>a) - case "10.13039/100004440"=> val targetId = getProjectId("wt__________" , "1e5e62235d094afd01cd56e65112fc63") + case "10.13039/100004440" => generateSimpleRelationFromAward(funder, "wt__________", a =>a) + val targetId = getProjectId("wt__________" , "1e5e62235d094afd01cd56e65112fc63") queue += generateRelation(sourceId,targetId, ModelConstants.IS_PRODUCED_BY) queue += generateRelation(targetId,sourceId, ModelConstants.PRODUCES) From fc8fceaac3c5c4f70289ed42735d9d1565f7c385 Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Wed, 10 Nov 2021 14:11:52 +0100 Subject: [PATCH 24/51] create direct link to WT projects as well --- .../main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala 
b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index 1404469ef..0ce55a96b 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -402,7 +402,8 @@ case object Crossref2Oaf { case "The French National Research Agency (ANR)" | "The French National Research Agency" => generateSimpleRelationFromAward(funder, "anr_________", a => a) case "CONICYT, Programa de Formación de Capital Humano Avanzado" => generateSimpleRelationFromAward(funder, "conicytf____", extractECAward) - case "Wellcome Trust Masters Fellowship" => val targetId = getProjectId("wt__________", "1e5e62235d094afd01cd56e65112fc63") + case "Wellcome Trust Masters Fellowship" => generateSimpleRelationFromAward(funder, "wt__________", a =>a) + val targetId = getProjectId("wt__________", "1e5e62235d094afd01cd56e65112fc63") queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY ) queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES ) case _ => logger.debug("no match for "+funder.name ) From 3f9b2ba8ce7d987e3bb65d1d76bc517636362814 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Mon, 22 Nov 2021 16:59:43 +0100 Subject: [PATCH 25/51] [Hosted By Map] fix issue in test --- .../java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestApply.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestApply.scala b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestApply.scala index 1bdcb60aa..4613d5636 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestApply.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestApply.scala @@ -54,7 +54,7 @@ class TestApply extends java.io.Serializable{ assertTrue(pa.getInstance().get(0).getHostedby.getValue.equals("Academic Therapy")) assertTrue(pa.getInstance().get(0).getAccessright.getClassid.equals("OPEN")) assertTrue(pa.getInstance().get(0).getAccessright.getClassname.equals("Open Access")) - assertTrue(pa.getInstance().get(0).getAccessright.getOpenAccessRoute.equals(OpenAccessRoute.hybrid)) + assertTrue(pa.getInstance().get(0).getAccessright.getOpenAccessRoute.equals(OpenAccessRoute.gold)) assertTrue(pa.getBestaccessright.getClassid.equals("OPEN")) assertTrue(pa.getBestaccessright.getClassname.equals("Open Access")) From cd9c51fd7a139cb5b410af8f83ce855c6f49aaee Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 9 Dec 2021 13:57:53 +0100 Subject: [PATCH 26/51] vocabulary-based cleaning also considers the term label when looking up a synonym --- .../common/vocabulary/VocabularyGroup.java | 8 ++ .../clean/GraphCleaningFunctionsTest.java | 7 +- .../eu/dnetlib/dhp/oa/graph/clean/result.json | 86 +++++++++++++++++++ 3 files changed, 99 insertions(+), 2 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java index d5f57849c..1c129ff9c 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java @@ -57,9 +57,17 @@ public class VocabularyGroup implements Serializable { final String syn = arr[2].trim(); vocs.addSynonyms(vocId, 
termId, syn); + } } + // add the term names as synonyms + vocs.vocs.values().forEach(voc -> { + voc.getTerms().values().forEach(term -> { + voc.addSynonym(term.getName().toLowerCase(), term.getId()); + }); + }); + return vocs; } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java index c8a368dd6..f822daa0e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java @@ -101,6 +101,9 @@ public class GraphCleaningFunctionsTest { assertEquals("0018", p_out.getInstance().get(0).getInstancetype().getClassid()); assertEquals("Annotation", p_out.getInstance().get(0).getInstancetype().getClassname()); + assertEquals("0033", p_out.getInstance().get(1).getInstancetype().getClassid()); + assertEquals("Audiovisual", p_out.getInstance().get(1).getInstancetype().getClassname()); + assertEquals("CLOSED", p_out.getInstance().get(0).getAccessright().getClassid()); assertEquals("Closed Access", p_out.getInstance().get(0).getAccessright().getClassname()); @@ -114,7 +117,7 @@ public class GraphCleaningFunctionsTest { List poi = p_out.getInstance(); assertNotNull(poi); - assertEquals(1, poi.size()); + assertEquals(2, poi.size()); final Instance poii = poi.get(0); assertNotNull(poii); @@ -163,7 +166,7 @@ public class GraphCleaningFunctionsTest { final List pci = p_cleaned.getInstance(); assertNotNull(pci); - assertEquals(1, pci.size()); + assertEquals(2, pci.size()); final Instance pcii = pci.get(0); assertNotNull(pcii); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json index b3e302474..5b9e86c65 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json @@ -403,6 +403,92 @@ "http://juuli.fi/Record/0275158616", "http://dx.doi.org/10.1007/s109090161569x" ] + }, + { + "pid": [ + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1002/s21010127267xy" + }, + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1008/abcd" + } + ], + "alternateIdentifier": [ + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1007/s109090161569x" + }, + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1009/qwerty" + } + ], + "accessright": { + "classid": "CLOSED", + "classname": "CLOSED", + "schemeid": "dnet:access_modes", + "schemename": "dnet:access_modes" + }, + "collectedfrom": { + "key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747", + "value": "VIRTA" + }, + "dateofacceptance": { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": 
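To make the effect of the change above concrete — term labels now resolve as synonyms alongside explicitly declared ones — here is a minimal, self-contained toy sketch of the lookup behaviour; it is not the actual VocabularyGroup API.

import java.util.HashMap;
import java.util.Map;

public class TermLabelSynonymSketch {

	private final Map<String, String> synonyms = new HashMap<>(); // synonym -> term id

	public void addTerm(final String id, final String name) {
		synonyms.put(id.toLowerCase(), id);
		synonyms.put(name.toLowerCase(), id); // the new behaviour: the term label resolves too
	}

	public String lookup(final String value) {
		return value == null ? null : synonyms.get(value.toLowerCase());
	}

	public static void main(final String[] args) {
		final TermLabelSynonymSketch voc = new TermLabelSynonymSketch();
		voc.addTerm("0001", "Article");
		System.out.println(voc.lookup("article")); // prints 0001, found via the label
	}
}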
"sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "value": "2016-01-01" + }, + "distributionlocation": "", + "hostedby": { + "key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747", + "value": "VIRTA" + }, + "instancetype": { + "classid": "Audiovisual", + "classname": "Audiovisual", + "schemeid": "dnet:publication_resource", + "schemename": "dnet:publication_resource" + }, + "url": [ + "http://dx.doi.org/10.1002/s21010127267xy" + ] } ], "journal": { From c1b6ae47cd4af0bd2db0901b8c1c623ef3800d5d Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 9 Dec 2021 16:44:28 +0100 Subject: [PATCH 27/51] cleaning workflow assigns the proper default instance type when a value could not be cleaned using the vocabularies --- .../oaf/utils/GraphCleaningFunctions.java | 36 +++++++- .../oa/graph/clean/CleanGraphSparkJob.java | 2 +- .../clean/GraphCleaningFunctionsTest.java | 20 +++-- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 2 +- .../eu/dnetlib/dhp/oa/graph/clean/result.json | 90 ++++++++++++++++++- 5 files changed, 138 insertions(+), 12 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index 592580ab8..f49a47c0a 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -16,6 +16,8 @@ import com.github.sisyphsu.dateparser.DateParserUtils; import com.google.common.collect.Lists; import com.google.common.collect.Sets; +import eu.dnetlib.dhp.common.vocabulary.Vocabulary; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; @@ -115,7 +117,7 @@ public class GraphCleaningFunctions extends CleaningFunctions { return true; } - public static T cleanup(T value) { + public static T cleanup(T value, VocabularyGroup vocs) { if (value instanceof Datasource) { // nothing to clean here } else if (value instanceof Project) { @@ -234,6 +236,38 @@ public class GraphCleaningFunctions extends CleaningFunctions { if (Objects.nonNull(r.getInstance())) { for (Instance i : r.getInstance()) { + if (!vocs.termExists(ModelConstants.DNET_PUBLICATION_RESOURCE, i.getInstancetype().getClassid())) { + if (r instanceof Publication) { + i + .setInstancetype( + OafMapperUtils + .qualifier( + "0038", "Other literature type", ModelConstants.DNET_PUBLICATION_RESOURCE, + ModelConstants.DNET_PUBLICATION_RESOURCE)); + } else if (r instanceof Dataset) { + i + .setInstancetype( + OafMapperUtils + .qualifier( + "0039", "Other dataset type", ModelConstants.DNET_PUBLICATION_RESOURCE, + ModelConstants.DNET_PUBLICATION_RESOURCE)); + } else if (r instanceof Software) { + i + .setInstancetype( + OafMapperUtils + .qualifier( + "0040", "Other software type", ModelConstants.DNET_PUBLICATION_RESOURCE, + ModelConstants.DNET_PUBLICATION_RESOURCE)); + } else if (r instanceof OtherResearchProduct) { + i + .setInstancetype( + OafMapperUtils + .qualifier( + "0020", "Other ORP type", ModelConstants.DNET_PUBLICATION_RESOURCE, + ModelConstants.DNET_PUBLICATION_RESOURCE)); + } + } + if (Objects.nonNull(i.getPid())) { i.setPid(processPidCleaning(i.getPid())); } diff --git 
a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java index d43d7ce28..2e2ea567a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java @@ -88,7 +88,7 @@ public class CleanGraphSparkJob { readTableFromPath(spark, inputPath, clazz) .map((MapFunction) GraphCleaningFunctions::fixVocabularyNames, Encoders.bean(clazz)) .map((MapFunction) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz)) - .map((MapFunction) GraphCleaningFunctions::cleanup, Encoders.bean(clazz)) + .map((MapFunction) value -> GraphCleaningFunctions.cleanup(value, vocs), Encoders.bean(clazz)) .filter((FilterFunction) GraphCleaningFunctions::filter) .write() .mode(SaveMode.Overwrite) diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java index f822daa0e..0e9efed21 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java @@ -101,8 +101,11 @@ public class GraphCleaningFunctionsTest { assertEquals("0018", p_out.getInstance().get(0).getInstancetype().getClassid()); assertEquals("Annotation", p_out.getInstance().get(0).getInstancetype().getClassname()); - assertEquals("0033", p_out.getInstance().get(1).getInstancetype().getClassid()); - assertEquals("Audiovisual", p_out.getInstance().get(1).getInstancetype().getClassname()); + assertEquals("0027", p_out.getInstance().get(1).getInstancetype().getClassid()); + assertEquals("Model", p_out.getInstance().get(1).getInstancetype().getClassname()); + + assertEquals("xyz", p_out.getInstance().get(2).getInstancetype().getClassid()); + assertEquals("xyz", p_out.getInstance().get(2).getInstancetype().getClassname()); assertEquals("CLOSED", p_out.getInstance().get(0).getAccessright().getClassid()); assertEquals("Closed Access", p_out.getInstance().get(0).getAccessright().getClassname()); @@ -117,7 +120,7 @@ public class GraphCleaningFunctionsTest { List poi = p_out.getInstance(); assertNotNull(poi); - assertEquals(2, poi.size()); + assertEquals(3, poi.size()); final Instance poii = poi.get(0); assertNotNull(poii); @@ -145,7 +148,7 @@ public class GraphCleaningFunctionsTest { assertEquals(5, p_out.getTitle().size()); - Publication p_cleaned = GraphCleaningFunctions.cleanup(p_out); + Publication p_cleaned = GraphCleaningFunctions.cleanup(p_out, vocabularies); assertEquals(3, p_cleaned.getTitle().size()); @@ -164,9 +167,12 @@ public class GraphCleaningFunctionsTest { assertEquals("1970-10-07", p_cleaned.getDateofacceptance().getValue()); + assertEquals("0038", p_cleaned.getInstance().get(2).getInstancetype().getClassid()); + assertEquals("Other literature type", p_cleaned.getInstance().get(2).getInstancetype().getClassname()); + final List pci = p_cleaned.getInstance(); assertNotNull(pci); - assertEquals(2, pci.size()); + assertEquals(3, pci.size()); final Instance pcii = pci.get(0); assertNotNull(pcii); @@ -234,7 +240,7 @@ public class GraphCleaningFunctionsTest { 
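As a compact restatement of the fallback introduced above, the sketch below maps each result subtype to its default instance type code when the declared value is not a known vocabulary term; plain strings stand in for the Qualifier objects.

public class InstanceTypeFallbackSketch {

	enum ResultType {
		PUBLICATION, DATASET, SOFTWARE, ORP
	}

	// defaults applied when the declared instance type is not a term of dnet:publication_resource
	static String defaultInstanceTypeClassid(final ResultType type) {
		switch (type) {
			case PUBLICATION:
				return "0038"; // Other literature type
			case DATASET:
				return "0039"; // Other dataset type
			case SOFTWARE:
				return "0040"; // Other software type
			case ORP:
				return "0020"; // Other ORP type
			default:
				return null;
		}
	}

	public static void main(final String[] args) {
		System.out.println(defaultInstanceTypeClassid(ResultType.PUBLICATION)); // 0038
	}
}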
.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/doiboostpub.json")); Publication p_in = MAPPER.readValue(json, Publication.class); Publication p_out = OafCleaner.apply(GraphCleaningFunctions.fixVocabularyNames(p_in), mapping); - Publication cleaned = GraphCleaningFunctions.cleanup(p_out); + Publication cleaned = GraphCleaningFunctions.cleanup(p_out, vocabularies); Assertions.assertEquals(true, GraphCleaningFunctions.filter(cleaned)); } @@ -245,7 +251,7 @@ public class GraphCleaningFunctionsTest { .toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/doiboostpub2.json")); Publication p_in = MAPPER.readValue(json, Publication.class); Publication p_out = OafCleaner.apply(GraphCleaningFunctions.fixVocabularyNames(p_in), mapping); - Publication cleaned = GraphCleaningFunctions.cleanup(p_out); + Publication cleaned = GraphCleaningFunctions.cleanup(p_out, vocabularies); Assertions.assertEquals(true, GraphCleaningFunctions.filter(cleaned)); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 27e33bf27..de79b750a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -708,7 +708,7 @@ class MappersTest { assertEquals(1, p.getTitle().size()); assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); - final Publication p_cleaned = cleanup(fixVocabularyNames(p)); + final Publication p_cleaned = cleanup(fixVocabularyNames(p), vocs); assertNotNull(p_cleaned.getTitle()); assertFalse(p_cleaned.getTitle().isEmpty()); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json index 5b9e86c65..78fdc4c9d 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json @@ -481,14 +481,100 @@ "value": "VIRTA" }, "instancetype": { - "classid": "Audiovisual", - "classname": "Audiovisual", + "classid": "Model", + "classname": "Model", "schemeid": "dnet:publication_resource", "schemename": "dnet:publication_resource" }, "url": [ "http://dx.doi.org/10.1002/s21010127267xy" ] + }, + { + "pid": [ + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1002/s21010127267xy" + }, + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1008/abcd" + } + ], + "alternateIdentifier": [ + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1007/s109090161569x" + }, + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1009/qwerty" + } + ], + "accessright": { + "classid": "CLOSED", + "classname": "CLOSED", + "schemeid": "dnet:access_modes", + "schemename": "dnet:access_modes" + }, + "collectedfrom": { + "key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747", + 
"value": "VIRTA" + }, + "dateofacceptance": { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "value": "2016-01-01" + }, + "distributionlocation": "", + "hostedby": { + "key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747", + "value": "VIRTA" + }, + "instancetype": { + "classid": "xyz", + "classname": "xyz", + "schemeid": "dnet:publication_resource", + "schemename": "dnet:publication_resource" + }, + "url": [ + "http://dx.doi.org/10.1002/t32121238378t" + ] } ], "journal": { From f0b523cfa7eb64c6adca18bee27ce6d0f80d7ecd Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Thu, 11 Nov 2021 12:57:19 +0200 Subject: [PATCH 28/51] removed the too restrctive clause. will discuss again --- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql index d461bd343..b3cbc9b41 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql @@ -126,7 +126,7 @@ FROM ${stats_db_name}.otherresearchproduct_topics; CREATE TABLE ${stats_db_name}.result_organization AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r -WHERE r.reltype = 'resultOrganization' and r.relclass='hasAuthorInstitution' and r.subreltype='affiliation' +WHERE r.reltype = 'resultOrganization' and r.datainfo.deletedbyinference = false; CREATE TABLE ${stats_db_name}.result_projects AS From 9458ee793834c878c1c2d6f2ca12c52f70dc34dc Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 3 Jan 2022 17:25:26 +0100 Subject: [PATCH 29/51] serialise records in the OAF-store-graph mdstores in json format. 
Read them again in the graph construction phase using a tolerant parser to support backward compatible changes in the evolution of the schema --- .../dhp/collection/CollectionUtils.scala | 13 +++++++ .../GenerateDataciteDatasetSpark.scala | 16 ++++----- .../bio/SparkTransformBioDatabaseToOAF.scala | 14 ++++---- .../ebi/SparkCreateBaselineDataFrame.scala | 16 +++++---- .../dhp/sx/bio/ebi/SparkEBILinksToOaf.scala | 10 +++--- .../dhp/sx/bio/pubmed/PubMedToOaf.scala | 2 +- .../raw/CopyHdfsOafSparkApplication.scala | 36 +++++++++++++------ 7 files changed, 68 insertions(+), 39 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectionUtils.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectionUtils.scala index 11ecfd6cb..26fcecbbd 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectionUtils.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectionUtils.scala @@ -1,7 +1,9 @@ package eu.dnetlib.dhp.collection +import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.schema.common.ModelSupport import eu.dnetlib.dhp.schema.oaf.{Oaf, OafEntity, Relation} +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode} object CollectionUtils { @@ -46,4 +48,15 @@ object CollectionUtils { List() } + def saveDataset(d: Dataset[Oaf], targetPath: String):Unit = { + implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf]) + val mapper = new ObjectMapper + + d + .flatMap(i => CollectionUtils.fixRelations(i)) + .filter(i => i != null) + .map(r => mapper.writeValueAsString(r))(Encoders.STRING) + .write.mode(SaveMode.Overwrite).save(targetPath) + } + } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala index e1607ee9c..d11c33fb4 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala @@ -2,14 +2,14 @@ package eu.dnetlib.dhp.datacite import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.AbstractScalaApplication -import eu.dnetlib.dhp.collection.CollectionUtils.fixRelations +import eu.dnetlib.dhp.collection.CollectionUtils import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH} import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.schema.mdstore.{MDStoreVersion, MetadataRecord} import eu.dnetlib.dhp.schema.oaf.Oaf import eu.dnetlib.dhp.utils.DHPUtils.writeHdfsFile import eu.dnetlib.dhp.utils.ISLookupClientFactory -import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.sql.{Encoder, Encoders, SparkSession} import org.slf4j.{Logger, LoggerFactory} @@ -73,12 +73,12 @@ class GenerateDataciteDatasetSpark (propertyPath:String, args:Array[String], log implicit val mrEncoder: Encoder[MetadataRecord] = Encoders.kryo[MetadataRecord] implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] - spark.read.load(sourcePath).as[DataciteType] - .filter(d => d.isActive) - .flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks)) - .filter(d => d != null) - .flatMap(i => fixRelations(i)).filter(i => i != null) - 
.write.mode(SaveMode.Overwrite).save(targetPath) + CollectionUtils.saveDataset( + spark.read.load(sourcePath).as[DataciteType] + .filter(d => d.isActive) + .flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks)) + .filter(d => d != null), + targetPath) } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala index 8ae8285e3..27caa8f36 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala @@ -1,12 +1,12 @@ package eu.dnetlib.dhp.sx.bio import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.schema.oaf.Oaf -import BioDBToOAF.ScholixResolved import eu.dnetlib.dhp.collection.CollectionUtils +import eu.dnetlib.dhp.schema.oaf.Oaf +import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf -import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.sql.{Encoder, Encoders, SparkSession} import org.slf4j.{Logger, LoggerFactory} object SparkTransformBioDatabaseToOAF { @@ -36,13 +36,13 @@ object SparkTransformBioDatabaseToOAF { import spark.implicits._ database.toUpperCase() match { case "UNIPROT" => - spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) + CollectionUtils.saveDataset(spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))), targetPath) case "PDB" => - spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) + CollectionUtils.saveDataset(spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))), targetPath) case "SCHOLIX" => - spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) + CollectionUtils.saveDataset(spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)), targetPath) case "CROSSREF_LINKS" => - spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) + CollectionUtils.saveDataset(spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))), targetPath) } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala index 17d21f19c..0fea4ff7f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala @@ -1,9 +1,10 @@ package eu.dnetlib.dhp.sx.bio.ebi import eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.dhp.collection.CollectionUtils 
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup -import eu.dnetlib.dhp.schema.oaf.Result -import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PubMedToOaf} +import eu.dnetlib.dhp.schema.oaf.{Oaf, Result} +import eu.dnetlib.dhp.sx.bio.pubmed._ import eu.dnetlib.dhp.utils.ISLookupClientFactory import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration @@ -177,7 +178,7 @@ object SparkCreateBaselineDataFrame { implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle]) implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal]) implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor]) - implicit val resultEncoder: Encoder[Result] = Encoders.kryo(classOf[Result]) + implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf]) if (!"true".equalsIgnoreCase(skipUpdate)) { downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri) @@ -192,9 +193,10 @@ object SparkCreateBaselineDataFrame { } val exported_dataset = spark.read.load(s"$workingPath/baseline_dataset").as[PMArticle] - exported_dataset - .map(a => PubMedToOaf.convert(a, vocabularies)).as[Result] - .filter(p => p != null) - .write.mode(SaveMode.Overwrite).save(targetPath) + CollectionUtils.saveDataset(exported_dataset + .map(a => PubMedToOaf.convert(a, vocabularies)).as[Oaf] + .filter(p => p != null), + targetPath) + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala index 8da617ca0..cd03f004d 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala @@ -1,11 +1,10 @@ package eu.dnetlib.dhp.sx.bio.ebi import eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.dhp.collection.CollectionUtils import eu.dnetlib.dhp.schema.oaf.Oaf import eu.dnetlib.dhp.sx.bio.BioDBToOAF import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem -import BioDBToOAF.EBILinkItem -import eu.dnetlib.dhp.collection.CollectionUtils import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf import org.apache.spark.sql._ @@ -35,10 +34,9 @@ object SparkEBILinksToOaf { val ebLinks: Dataset[EBILinkItem] = spark.read.load(sourcePath).as[EBILinkItem].filter(l => l.links != null && l.links.startsWith("{")) - ebLinks.flatMap(j => BioDBToOAF.parse_ebi_links(j.links)) + CollectionUtils.saveDataset(ebLinks.flatMap(j => BioDBToOAF.parse_ebi_links(j.links)) .filter(p => BioDBToOAF.EBITargetLinksFilter(p)) - .flatMap(p => BioDBToOAF.convertEBILinksToOaf(p)) - .flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null) - .write.mode(SaveMode.Overwrite).save(targetPath) + .flatMap(p => BioDBToOAF.convertEBILinksToOaf(p)), + targetPath) } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala index ecef32202..d09237842 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala @@ -111,7 +111,7 @@ object PubMedToOaf { * @param vocabularies the vocabularies * @return The OAF instance if the mapping did not fail */ - def convert(article: 
PMArticle, vocabularies: VocabularyGroup): Result = { + def convert(article: PMArticle, vocabularies: VocabularyGroup): Oaf = { if (article.getPublicationTypes == null) return null diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala index c7ad1890d..1376c6b35 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala @@ -1,18 +1,15 @@ package eu.dnetlib.dhp.oa.graph.raw -import com.fasterxml.jackson.databind.ObjectMapper +import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper} import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.common.HdfsSupport import eu.dnetlib.dhp.schema.common.ModelSupport -import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo import eu.dnetlib.dhp.schema.oaf.Oaf import eu.dnetlib.dhp.utils.DHPUtils -import org.apache.commons.io.IOUtils -import org.apache.commons.lang3.StringUtils -import org.apache.http.client.methods.HttpGet -import org.apache.http.impl.client.HttpClients import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.{SparkConf, SparkContext} +import org.json4s.DefaultFormats +import org.json4s.jackson.JsonMethods.parse import org.slf4j.LoggerFactory import scala.collection.JavaConverters._ @@ -51,18 +48,21 @@ object CopyHdfsOafSparkApplication { log.info("hdfsPath: {}", hdfsPath) implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] + import spark.implicits._ val paths = DHPUtils.mdstorePaths(mdstoreManagerUrl, mdFormat, mdLayout, mdInterpretation, true).asScala val validPaths: List[String] = paths.filter(p => HdfsSupport.exists(p, sc.hadoopConfiguration)).toList if (validPaths.nonEmpty) { - val oaf = spark.read.load(validPaths: _*).as[Oaf] - val mapper = new ObjectMapper() - val l =ModelSupport.oafTypes.entrySet.asScala.map(e => e.getKey).toList + val oaf = spark.read.load(validPaths: _*).as[String] + val mapper = new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) + val l = ModelSupport.oafTypes.entrySet.asScala.toList l.foreach( e => - oaf.filter(o => o.getClass.getSimpleName.equalsIgnoreCase(e)) + oaf + .filter(o => isOafType(o, e.getKey)) + .map(j => mapper.readValue(j, e.getValue).asInstanceOf[Oaf]) .map(s => mapper.writeValueAsString(s))(Encoders.STRING) .write .option("compression", "gzip") @@ -71,4 +71,20 @@ object CopyHdfsOafSparkApplication { ) } } + + def isOafType(input: String, oafType: String): Boolean = { + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json: org.json4s.JValue = parse(input) + if (oafType == "relation") { + val hasSource = (json \ "source").extractOrElse[String](null) + val hasTarget = (json \ "target").extractOrElse[String](null) + + hasSource != null && hasTarget != null + } else { + val hasId = (json \ "id").extractOrElse[String](null) + val resultType = (json \ "resulttype" \ "classid").extractOrElse[String](null) + hasId != null && oafType.equalsIgnoreCase(resultType) + } + + } } From 8d13effa311565955a2b625d652f44551baabdab Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 4 Jan 2022 11:26:56 +0100 Subject: [PATCH 30/51] test for the tolerant deserialisation utility method --- 
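Note below the "---" (annotation only, not picked up by git am): a minimal sketch of the tolerant deserialisation behaviour this test exercises. MiniResult and the sample JSON are illustrative stand-ins, not the real eu.dnetlib.dhp.schema.oaf types; the point is that disabling FAIL_ON_UNKNOWN_PROPERTIES makes Jackson skip fields introduced by newer schema versions instead of rejecting the whole record:

    import com.fasterxml.jackson.databind.DeserializationFeature;
    import com.fasterxml.jackson.databind.ObjectMapper;

    public class TolerantMapperSketch {

        // stand-in for an OAF bean: only the known "id" field is mapped
        public static class MiniResult {
            private String id;

            public String getId() { return id; }

            public void setId(String id) { this.id = id; }
        }

        public static void main(String[] args) throws Exception {
            ObjectMapper mapper = new ObjectMapper()
                .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
            // "unknownProperty" is silently ignored instead of raising
            // UnrecognizedPropertyException, so older readers survive newer records
            MiniResult r = mapper.readValue(
                "{\"id\":\"50|xyz::123\",\"unknownProperty\":\"asdasd\"}",
                MiniResult.class);
            System.out.println(r.getId()); // prints: 50|xyz::123
        }
    }

The same mapper configuration is used in CopyHdfsOafSparkApplication, and the publication_2_unknownProperty.json fixture added by this commit covers exactly this case.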
.../raw/CopyHdfsOafSparkApplicationTest.java | 65 +++++++++++++++++++ .../dnetlib/dhp/oa/graph/raw/dataset_1.json | 3 + .../dhp/oa/graph/raw/publication_1.json | 1 + .../raw/publication_2_unknownProperty.json | 1 + .../dnetlib/dhp/oa/graph/raw/relation_1.json | 6 ++ 5 files changed, 76 insertions(+) create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplicationTest.java create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/dataset_1.json create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/publication_1.json create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/publication_2_unknownProperty.json create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/relation_1.json diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplicationTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplicationTest.java new file mode 100644 index 000000000..85cb551bc --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplicationTest.java @@ -0,0 +1,65 @@ + +package eu.dnetlib.dhp.oa.graph.raw; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.IOException; + +import org.apache.commons.io.IOUtils; +import org.junit.jupiter.api.Test; + +public class CopyHdfsOafSparkApplicationTest { + + @Test + void testIsOafType() throws IOException { + assertTrue( + CopyHdfsOafSparkApplication + .isOafType( + IOUtils + .toString( + getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/raw/publication_1.json")), + "publication")); + assertTrue( + CopyHdfsOafSparkApplication + .isOafType( + IOUtils + .toString( + getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/raw/dataset_1.json")), + "dataset")); + assertTrue( + CopyHdfsOafSparkApplication + .isOafType( + IOUtils + .toString( + getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/raw/relation_1.json")), + "relation")); + + assertFalse( + CopyHdfsOafSparkApplication + .isOafType( + IOUtils + .toString( + getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/raw/publication_1.json")), + "dataset")); + assertFalse( + CopyHdfsOafSparkApplication + .isOafType( + IOUtils + .toString( + getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/raw/dataset_1.json")), + "publication")); + + assertTrue( + CopyHdfsOafSparkApplication + .isOafType( + IOUtils + .toString( + getClass() + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/raw/publication_2_unknownProperty.json")), + "publication")); + + } + +} diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/dataset_1.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/dataset_1.json new file mode 100644 index 000000000..c22dc94e3 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/dataset_1.json @@ -0,0 +1,3 @@ +{"author":[{"affiliation":[],"fullname":"Greenough, B","name":"B","pid":[],"rank":1,"surname":"Greenough"}],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS 
(Data Archiving and Networked Services)"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofcollection":"2021-09-25T10:55:00.639Z","dateoftransformation":"2021-09-25T11:00:04.201Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Heritage Education"}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"geolocation":[],"id":"50|DansKnawCris::09821844208a5cd6300b2bfb13bca1b9","instance":[{"accessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"urn:nbn:nl:ui:13-59-cjhf"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.17632/96bpgw5j9d.1"}],"collectedfrom":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"hostedby":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked 
Services)"},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"alternateIdentifier":[],"refereed":{"classid":"0000","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["","http://dx.doi.org/10.17632/96bpgw5j9d.1"]}],"language":{"classid":"und","classname":"Undetermined","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1635434801681,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2021-08-16T15:29:45Z","harvestDate":"2021-09-25T10:55:00.639Z","identifier":"oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:211323","metadataNamespace":""}},"originalId":["50|DansKnawCris::09821844208a5cd6300b2bfb13bca1b9","oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:211323"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"0021","classname":"0021","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Interdisciplinary sciences"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Interdisciplinary sciences"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Heritage Education"}]} +{"author":[{"affiliation":[],"fullname":"Keijers, D.M.G.","name":"D.M.G.","pid":[],"rank":1,"surname":"Keijers"}],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked 
Services)"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofcollection":"2021-09-25T10:41:59.767Z","dateoftransformation":"2021-09-25T11:00:19.238Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"onderzoeksrapport"}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"geolocation":[],"id":"50|DansKnawCris::0dd644304b7116e8e58da3a5e3adc37a","instance":[{"accessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"urn:nbn:nl:ui:13-das-fkq"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.17026/dans-xsw-qtnx"}],"collectedfrom":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"hostedby":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"alternateIdentifier":[],"refereed":{"classid":"0000","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["","http://dx.doi.org/10.17026/dans-xsw-qtnx"]}],"language":{"classid":"dut/nld","classname":"Dutch; 
Flemish","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1635434847381,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2021-08-16T13:53:29Z","harvestDate":"2021-09-25T10:41:59.767Z","identifier":"oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:20759","metadataNamespace":""}},"originalId":["oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:20759","50|DansKnawCris::0dd644304b7116e8e58da3a5e3adc37a"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"0021","classname":"0021","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"PROSPECTIE"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Plangebied Lange Ekker te Vessem, gemeente Eersel"}]} +{"author":[],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofcollection":"2021-09-25T10:43:13.768Z","dateoftransformation":"2021-09-25T11:01:22.863Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"This find is registered at Portable Antiquities of the Netherlands with number 
PAN-00054604"}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"geolocation":[],"id":"50|DansKnawCris::203a27996ddc0fd1948258e5b7dec61c","instance":[{"accessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"urn:nbn:nl:ui:13-a7-hwgy"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.17026/dans-x3z-fsq5"}],"collectedfrom":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"hostedby":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"alternateIdentifier":[],"refereed":{"classid":"0000","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["","http://dx.doi.org/10.17026/dans-x3z-fsq5"]}],"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1635434508886,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2021-08-16T14:01:37Z","harvestDate":"2021-09-25T10:43:13.768Z","identifier":"oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:129566","metadataNamespace":""}},"originalId":["oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:129566","50|DansKnawCris::203a27996ddc0fd1948258e5b7dec61c"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"0021","classname":"0021","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"early medieval enamelled disc brooch variant 
A9"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Broader Match: disc brooches"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Broader Match: schijffibula - geemailleerd"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"metal"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"copper alloy"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Temporal coverage: Early Middle Ages C"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Temporal coverage: Early Middle Ages D"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Temporal coverage: 800 until 
1000"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"PAN-00054604 - early medieval enamelled disc brooch variant A9"}]} diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/publication_1.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/publication_1.json new file mode 100644 index 000000000..bd3710234 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/publication_1.json @@ -0,0 +1 @@ +{"author":[{"fullname":"Makkonen, Lasse","name":"Lasse","pid":[],"rank":1,"surname":"Makkonen"}],"bestaccessright":{"classid":"RESTRICTED","classname":"Restricted","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"}],"context":[],"contributor":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Watson, Rick"}],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"1997-01-01"},"dateofcollection":"2021-10-04T12:42:57.502Z","dateoftransformation":"2021-10-04T15:32:51.877Z","description":[],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|355e65625b88::0b579a3501cf87921448e0a1c7fc8353","instance":[{"accessright":{"classid":"RESTRICTED","classname":"Restricted","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information 
System"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"1997-01-01"},"distributionlocation":"","hostedby":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"instancetype":{"classid":"0005","classname":"Contribution for newspaper or weekly magazine","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://cris.vtt.fi/en/publications/5a6fbe30-2096-4106-96f4-ed36620d3f73"]}],"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1635433424020,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Fcris.vtt.fi%2Fws%2Foai","datestamp":"2020-10-26T12:07:44Z","harvestDate":"2021-10-04T12:42:57.502Z","identifier":"oai:cris.vtt.fi:publications/5a6fbe30-2096-4106-96f4-ed36620d3f73","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"originalId":["50|355e65625b88::0b579a3501cf87921448e0a1c7fc8353","oai:cris.vtt.fi:publications/5a6fbe30-2096-4106-96f4-ed36620d3f73"],"pid":[],"publisher":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Irish Wind Energy Association"},"relevantdate":[],"resourcetype":{"classid":"UNKNOWN","classname":"Unknown","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Makkonen , L 1997 , Calculation of ice accretion on wind turbine blades . in R Watson (ed.) , EWEC '97: European Wind Energy Conference : Proceedings of the international conference . 
Irish Wind Energy Association , Slane , European Wind Energy Conference EWEC '97 , Dublin , Ireland , 6/10/97 ."}],"subject":[],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Calculation of ice accretion on wind turbine blades"}]} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/publication_2_unknownProperty.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/publication_2_unknownProperty.json new file mode 100644 index 000000000..364796c24 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/publication_2_unknownProperty.json @@ -0,0 +1 @@ +{"id":"50|355e65625b88::0b579a3501cf87921448e0a1c7fc8353", "resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"}, "unknownProperty": "asdasd"} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/relation_1.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/relation_1.json new file mode 100644 index 000000000..31755c53d --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/relation_1.json @@ -0,0 +1,6 @@ +{"source":"1a","target":"10a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]} +{"source":"10a","target":"1a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]} 
+{"source":"2a","target":"20a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]} +{"source":"20a","target":"2a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]} +{"source":"15a","target":"25a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]} +{"source":"25a","target":"15a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]} \ No newline at end of file From f82db765db357a76fa2421ca4e430eb6e8ef5d67 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 4 Jan 2022 14:23:01 +0100 Subject: [PATCH 31/51] OAF-store-graph mdstores: save them in text format --- .../java/eu/dnetlib/dhp/collection/CollectionUtils.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectionUtils.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectionUtils.scala index 26fcecbbd..509e464e5 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectionUtils.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectionUtils.scala @@ -56,7 +56,10 @@ object CollectionUtils { .flatMap(i => CollectionUtils.fixRelations(i)) .filter(i => i != null) .map(r => mapper.writeValueAsString(r))(Encoders.STRING) - .write.mode(SaveMode.Overwrite).save(targetPath) + .write + 
.mode(SaveMode.Overwrite) + .option("compression", "gzip") + .text(targetPath) } } From 3dc48c7ab51fcf843c615614db9e5ea9eec6cb4b Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 4 Jan 2022 14:40:16 +0100 Subject: [PATCH 32/51] OAF-store-graph mdstores: save them in text format --- .../java/eu/dnetlib/dhp/collection/CollectionUtils.scala | 8 +++----- .../dhp/datacite/GenerateDataciteDatasetSpark.scala | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectionUtils.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectionUtils.scala index 509e464e5..e7de60e07 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectionUtils.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectionUtils.scala @@ -48,18 +48,16 @@ object CollectionUtils { List() } - def saveDataset(d: Dataset[Oaf], targetPath: String):Unit = { + def saveDataset(dataset: Dataset[Oaf], targetPath: String): Unit = { implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf]) - val mapper = new ObjectMapper - d + dataset .flatMap(i => CollectionUtils.fixRelations(i)) .filter(i => i != null) - .map(r => mapper.writeValueAsString(r))(Encoders.STRING) .write .mode(SaveMode.Overwrite) .option("compression", "gzip") - .text(targetPath) + .json(targetPath) } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala index d11c33fb4..a205edcf2 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala @@ -38,7 +38,7 @@ class GenerateDataciteDatasetSpark (propertyPath:String, args:Array[String], log val cleanedMdStoreVersion = mapper.readValue(mdstoreOutputVersion, classOf[MDStoreVersion]) val outputBasePath = cleanedMdStoreVersion.getHdfsPath log.info(s"outputBasePath is '$outputBasePath'") - val targetPath = s"$outputBasePath/$MDSTORE_DATA_PATH" + val targetPath = s"$outputBasePath$MDSTORE_DATA_PATH" log.info(s"targetPath is '$targetPath'") generateDataciteDataset(sourcePath, exportLinks, vocabularies, targetPath, spark) @@ -54,7 +54,7 @@ class GenerateDataciteDatasetSpark (propertyPath:String, args:Array[String], log * @param outputBasePath */ def reportTotalSize( targetPath: String, outputBasePath: String ):Unit = { - val total_items = spark.read.load(targetPath).count() + val total_items = spark.read.text(targetPath).count() writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$total_items", outputBasePath + MDSTORE_SIZE_PATH) } From 3bd3653be9e884679c390f8b9f9f36800b657692 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 4 Jan 2022 15:02:09 +0100 Subject: [PATCH 33/51] OAF-store-graph mdstores: save them in text format --- .../main/java/eu/dnetlib/dhp/collection/CollectionUtils.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectionUtils.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectionUtils.scala index e7de60e07..86a28ac10 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectionUtils.scala +++ 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectionUtils.scala @@ -50,14 +50,16 @@ object CollectionUtils { def saveDataset(dataset: Dataset[Oaf], targetPath: String): Unit = { implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf]) + val mapper = new ObjectMapper dataset .flatMap(i => CollectionUtils.fixRelations(i)) .filter(i => i != null) + .map(r => mapper.writeValueAsString(r))(Encoders.STRING) .write .mode(SaveMode.Overwrite) .option("compression", "gzip") - .json(targetPath) + .text(targetPath) } } From 8ae46ca7896d88561417a34394dc502e6e9be6b6 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 5 Jan 2022 15:49:05 +0100 Subject: [PATCH 34/51] OAF-store-graph mdstores: further fix for PR#180 --- .../raw/CopyHdfsOafSparkApplication.scala | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala index 1376c6b35..fa13f477c 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala @@ -48,26 +48,27 @@ object CopyHdfsOafSparkApplication { log.info("hdfsPath: {}", hdfsPath) implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] - import spark.implicits._ val paths = DHPUtils.mdstorePaths(mdstoreManagerUrl, mdFormat, mdLayout, mdInterpretation, true).asScala val validPaths: List[String] = paths.filter(p => HdfsSupport.exists(p, sc.hadoopConfiguration)).toList + val types = ModelSupport.oafTypes.entrySet + .asScala + .map(e => Tuple2(e.getKey, e.getValue)) + if (validPaths.nonEmpty) { - val oaf = spark.read.load(validPaths: _*).as[String] + val oaf = spark.read.textFile(validPaths: _*) val mapper = new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) - val l = ModelSupport.oafTypes.entrySet.asScala.toList - l.foreach( - e => - oaf - .filter(o => isOafType(o, e.getKey)) - .map(j => mapper.readValue(j, e.getValue).asInstanceOf[Oaf]) - .map(s => mapper.writeValueAsString(s))(Encoders.STRING) - .write - .option("compression", "gzip") - .mode(SaveMode.Append) - .text(s"$hdfsPath/${e}") + + types.foreach(t => oaf + .filter(o => isOafType(o, t._1)) + .map(j => mapper.readValue(j, t._2).asInstanceOf[Oaf]) + .map(s => mapper.writeValueAsString(s))(Encoders.STRING) + .write + .option("compression", "gzip") + .mode(SaveMode.Append) + .text(s"$hdfsPath/${t._1}") ) } } From bb17e070d8de9fadfdff214a5869e360e0a3299c Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Thu, 23 Dec 2021 15:46:17 +0200 Subject: [PATCH 35/51] added result_result relations --- .../oa/graph/stats/oozie_app/scripts/step13.sql | 15 ++++++++++++++- .../oozie_app/scripts/step20-createMonitorDB.sql | 3 +++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql index e4e81175c..315d296fc 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql +++ 
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql
index e4e81175c..315d296fc 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql
@@ -67,4 +67,17 @@ from (
     LATERAL VIEW explode(author) a as auth
     LATERAL VIEW explode(auth.pid) ap as auth_pid
     LATERAL VIEW explode(auth.pid.qualifier.classid) apt as author_pid_type
-    WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res
\ No newline at end of file
+    WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res;
+
+create table ${stats_db_name}.result_result stored as parquet as
+select substr(rel.source, 4) as source, substr(rel.target, 4) as target, relclass, subreltype
+from ${openaire_db_name}.relation rel
+join ${openaire_db_name}.result r1 on rel.source=r1.id
+join ${openaire_db_name}.result r2 on r2.id=rel.target
+where reltype='resultResult'
+  and r1.resulttype.classname!=r2.resulttype.classname
+  and r1.datainfo.deletedbyinference=false
+  and r2.datainfo.deletedbyinference=false
+  and r1.resulttype.classname != 'other'
+  and r2.resulttype.classname != 'other'
+  and rel.datainfo.deletedbyinference=false;
\ No newline at end of file
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql
index fa8e4c6a7..ce7b4bd9c 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql
@@ -79,6 +79,9 @@ compute stats TARGET.result_sources;
 create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id);
 compute stats TARGET.result_topics;
 
+create table TARGET.result_result stored as parquet as select * from SOURCE.result_result orig where exists (select 1 from TARGET.result r where r.id=orig.source or r.id=orig.target);
+compute stats TARGET.result_result;
+
 -- datasources
 create view if not exists TARGET.datasource as select * from SOURCE.datasource;
 create view if not exists TARGET.datasource_oids as select * from SOURCE.datasource_oids;

From a4316bafedbc312de48096caa2e9e77b912b8224 Mon Sep 17 00:00:00 2001
From: Antonis Lempesis
Date: Thu, 23 Dec 2021 15:33:53 +0200
Subject: [PATCH 36/51] fixed a typo

---
 .../scripts/step20-createMonitorDB.sql        | 24 ++++++++++++-------
 1 file changed, 15 insertions(+), 9 deletions(-)
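The hunk below keeps following the convention used throughout step20-createMonitorDB.sql:
each TARGET table is a copy of the corresponding SOURCE table restricted to the results
present in TARGET.result. Schematically (an illustrative template, not part of the patch;
X stands for any of the per-result tables):

-- illustrative template of the recurring monitor-DB pattern
create table TARGET.X stored as parquet as
select * from SOURCE.X orig
where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.X;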
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql
index ce7b4bd9c..2d6ff41d4 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql
@@ -49,8 +49,10 @@ compute stats TARGET.result_greenoa;
 create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id);
 compute stats TARGET.result_languages;
 
-create table TARGET.result_licences stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id);
-compute stats TARGET.result_licences;
+create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+compute stats TARGET.result_licenses;
+
+create table TARGET.licenses_normalized as select * from SOURCE.licenses_normalized;
 
 create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
 compute stats TARGET.result_oids;
@@ -79,9 +81,6 @@ compute stats TARGET.result_sources;
 create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id);
 compute stats TARGET.result_topics;
 
-create table TARGET.result_result stored as parquet as select * from SOURCE.result_result orig where exists (select 1 from TARGET.result r where r.id=orig.source or r.id=orig.target);
-compute stats TARGET.result_result;
-
 -- datasources
 create view if not exists TARGET.datasource as select * from SOURCE.datasource;
 create view if not exists TARGET.datasource_oids as select * from SOURCE.datasource_oids;
@@ -103,6 +102,7 @@ create view if not exists TARGET.project as select * from SOURCE.project;
 create view if not exists TARGET.project_oids as select * from SOURCE.project_oids;
 create view if not exists TARGET.project_organizations as select * from SOURCE.project_organizations;
 create view if not exists TARGET.project_resultcount as select * from SOURCE.project_resultcount;
+create view if not exists TARGET.project_classification as select * from SOURCE.project_classification;
 
 create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects;
 compute stats TARGET.project_results;
@@ -118,10 +118,10 @@ create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.i
 compute stats TARGET.indi_pub_gold_oa;
 create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id);
 compute stats TARGET.indi_pub_has_abstract;
-create table TARGET.indi_pub_has_cc_licence stored as parquet as select * from SOURCE.indi_pub_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id);
-compute stats TARGET.indi_pub_has_cc_licence;
-create table TARGET.indi_pub_has_cc_licence_url stored as parquet as select * from SOURCE.indi_pub_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id);
-compute stats TARGET.indi_pub_has_cc_licence_url;
+create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+compute stats TARGET.indi_result_has_cc_licence;
+create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+compute stats TARGET.indi_result_has_cc_licence_url;
 
 create view TARGET.indi_funder_country_collab stored as select * from SOURCE.indi_funder_country_collab;
 
@@ -138,6 +138,12 @@ compute stats TARGET.indi_pub_in_transformative;
 create table TARGET.indi_pub_closed_other_open stored as parquet as select * from SOURCE.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result r where r.id=orig.id);
 compute stats TARGET.indi_pub_closed_other_open;
+create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+compute stats TARGET.indi_result_no_of_copies;
+
+--- Usage statistics
+create table TARGET.usage_stats stored as parquet as select * from SOURCE.usage_stats orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
+
 --denorm
 alter table TARGET.result rename to TARGET.res_tmp;

From 4878d7485c1e1f79a3bdbb83eaed364c5ce4f576 Mon Sep 17 00:00:00 2001
From: Antonis Lempesis
Date: Wed, 22 Dec 2021 02:54:42 +0200
Subject: [PATCH 37/51] added usage stats

---
 .../oa/graph/stats/oozie_app/scripts/step10.sql | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql
index fc0162a9c..13a4803a9 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql
@@ -27,6 +27,22 @@ CREATE OR REPLACE VIEW ${stats_db_name}.licenses_normalized AS
 SELECT *
 FROM ${external_stats_db_name}.licenses_normalized;
 
+------------------------------------------------------------------------------------------------
+------------------------------------------------------------------------------------------------
+-- Usage statistics
+------------------------------------------------------------------------------------------------
+------------------------------------------------------------------------------------------------
+create or replace view ${stats_db_name}.usage_stats as
+select * from openaire_prod_usage_stats.usage_stats;
+
+create or replace view ${stats_db_name}.downloads_stats as
+select * from openaire_prod_usage_stats.downloads_stats;
+
+create or replace view ${stats_db_name}.pageviews_stats as
+select * from openaire_prod_usage_stats.pageviews_stats;
+
+create or replace view ${stats_db_name}.views_stats as
+select * from openaire_prod_usage_stats.views_stats;
 
 ------------------------------------------------------------------------------------------------
 ------------------------------------------------------------------------------------------------

From 3ce1976627f5ea79d3b34dadcd5707aaa8da79c3 Mon Sep 17 00:00:00 2001
From: Antonis Lempesis
Date: Tue, 21 Dec 2021 22:55:04 +0200
Subject: [PATCH 38/51] fixed column names

---
 .../stats/oozie_app/scripts/step16-createIndicatorsTables.sql | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
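With the usage-stats views introduced in step10 above, per-result usage can be related
to the rest of the stats DB. For example (an illustrative sketch, not part of any
patch; result_id is the column referenced by the monitor-DB filter in step20, and the
rest of the query only relies on the result table used throughout these scripts):

-- illustrative sketch: how many results of each type have usage events recorded
select r.type, count(distinct us.result_id) as results_with_usage
from ${stats_db_name}.usage_stats us
join ${stats_db_name}.result r on r.id = us.result_id
group by r.type;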
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
index 926c8825f..3b1a7eeaf 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
@@ -304,7 +304,7 @@ with tmp as (select funder, project, country from organization_projects op
 join organization o on o.id=op.id
 join project p on p.id=op.project
 where country <> 'UNKNOWN')
-select f1.funder, f1.country, f2.country, count(distinct f1.project) as collaborations
+select f1.funder, f1.country as country1, f2.country as country2, count(distinct f1.project) as collaborations
 from tmp as f1
 join tmp as f2 on f1.project=f2.project
 where f1.country<>f2.country

From e91f06f39bf38016c1d8565d53ff5e853afcc76c Mon Sep 17 00:00:00 2001
From: Antonis Lempesis
Date: Tue, 21 Dec 2021 15:54:38 +0200
Subject: [PATCH 39/51] fixed typos in indicators. Added extra views in monitor

---
 .../scripts/step16-createIndicatorsTables.sql | 336 ++++--------------
 .../scripts/step20-createMonitorDB.sql        |   6 +-
 2 files changed, 79 insertions(+), 263 deletions(-)
 mode change 100644 => 100755 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql

diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
old mode 100644
new mode 100755
index 3b1a7eeaf..bdd681f8d
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
@@ -1,28 +1,29 @@
+---- Sprint 1 ----
 create table indi_pub_green_oa stored as parquet as
-select distinct p.id, coalesce(green_oa, 0) as green_oa
+select distinct p.id, coalesce(green_oa, 0) as green_oa
 from publication p
-left outer join (
-select p.id, 1 as green_oa
+left outer join (
+select p.id, 1 as green_oa
 from publication p
 join result_instance ri on ri.id = p.id
 join datasource on datasource.id = ri.hostedby
 where datasource.type like '%Repository%'
-and (ri.accessright = 'Open Access'
-or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')) tmp
+and (ri.accessright = 'Open Access'
+or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')) tmp
 on p.id= tmp.id;
 
 create table indi_pub_grey_lit stored as parquet as
 select distinct p.id, coalesce(grey_lit, 0) as grey_lit
 from publication p
 left outer join (
-select p.id, 1 as grey_lit
+select p.id, 1 as grey_lit
 from publication p
 join result_classifications rt on rt.id = p.id
-where rt.type not in ('Article','Part of book or chapter of book','Book','Doctoral thesis','Master thesis','Data Paper', 'Thesis', 'Bachelor thesis', 'Conference object') and
+where rt.type not in ('Article','Part of book or chapter of book','Book','Doctoral thesis','Master thesis','Data Paper', 'Thesis', 'Bachelor thesis', 'Conference object') and
 not exists (select 1 from result_classifications rc where type ='Other literature type' and rc.id=p.id)) tmp
 on p.id=tmp.id;
 
 create table indi_pub_doi_from_crossref stored as parquet as
-select distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref
+select distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref
 from publication p
 left outer join (select ri.id, 1 as doi_from_crossref from result_instance ri
@@ -33,7 +34,7 @@ on tmp.id=p.id;
 create table indi_pub_gold_oa stored as parquet as
 select distinct p.id, coalesce(gold_oa, 0) as gold_oa
 from publication p
-left outer join (
+left outer join (
 select p.id, 1 as 
gold_oa from publication p join result_instance ri on ri.id = p.id @@ -41,255 +42,56 @@ join datasource on datasource.id = ri.hostedby where datasource.id like '%doajarticles%') tmp on p.id= tmp.id; ---create table indi_project_pubs_count stored as parquet as ---select pr.id id, count(p.id) total_pubs from project_results pr ---join publication p on p.id=pr.result ---group by pr.id; - ---create table indi_project_datasets_count stored as parquet as ---select pr.id id, count(d.id) total_datasets from project_results pr ---join dataset d on d.id=pr.result ---group by pr.id; - ---create table indi_project_software_count stored as parquet as ---select pr.id id, count(s.id) total_software from project_results pr ---join software s on s.id=pr.result ---group by pr.id; - ---create table indi_project_otherresearch_count stored as parquet as ---select pr.id id, count(o.id) total_other from project_results pr ---join otherresearchproduct o on o.id=pr.result ---group by pr.id; - ---create table indi_pub_avg_year_country_oa stored as parquet as ---select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA, ---round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA - --from - --(SELECT year, country, SUM(CASE - --WHEN bestlicence='Open Access' THEN 1 - --ELSE 0 - --END) AS OpenAccess, SUM(CASE - --WHEN bestlicence<>'Open Access' THEN 1 - --ELSE 0 - --END) AS NonOpenAccess - --FROM publication p - --join result_organization ro on p.id=ro.id - --join organization o on o.id=ro.organization - --where cast(year as int)>=2003 and cast(year as int)<=2021 - --group by year, country) tmp; - ---create table indi_dataset_avg_year_country_oa stored as parquet as ---select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA, ---round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA ---from - --(SELECT year, country, SUM(CASE - --WHEN bestlicence='Open Access' THEN 1 - --ELSE 0 - --END) AS OpenAccess, SUM(CASE - --WHEN bestlicence<>'Open Access' THEN 1 - --ELSE 0 - --END) AS NonOpenAccess - --FROM dataset d - --join result_organization ro on d.id=ro.id - --join organization o on o.id=ro.organization - --where cast(year as int)>=2003 and cast(year as int)<=2021 - --group by year, country) tmp; - ---create table indi_software_avg_year_country_oa stored as parquet as ---select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA, ---round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA ---from --- (SELECT year, country, SUM(CASE - --WHEN bestlicence='Open Access' THEN 1 --- ELSE 0 ---END) AS OpenAccess, SUM(CASE --- WHEN bestlicence<>'Open Access' THEN 1 --- ELSE 0 --- END) AS NonOpenAccess --- FROM software s --- join result_organization ro on s.id=ro.id --- join organization o on o.id=ro.organization --- where cast(year as int)>=2003 and cast(year as int)<=2021 --- group by year, country) tmp; - - ---create table indi_other_avg_year_country_oa stored as parquet as ---select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA, ---round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA --- from --- (SELECT year, country, SUM(CASE --- WHEN bestlicence='Open Access' THEN 1 --- ELSE 0 --- END) AS OpenAccess, SUM(CASE --- WHEN bestlicence<>'Open Access' THEN 1 --- ELSE 0 --- END) AS NonOpenAccess --- FROM otherresearchproduct orp --- join result_organization ro on orp.id=ro.id --- join organization o on o.id=ro.organization --- where cast(year as int)>=2003 
and cast(year as int)<=2021 --- group by year, country) tmp; - ---create table indi_pub_avg_year_context_oa stored as parquet as ---with total as ---(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from publication_concepts pc ---join context c on pc.concept like concat('%',c.id,'%') ---join publication p on p.id=pc.id ---where cast(year as int)>=2003 and cast(year as int)<=2021 ---group by c.name, year ) ---select year, name, round(no_of_pubs/total*100,3) averageofpubs ---from total; - ---create table indi_dataset_avg_year_context_oa stored as parquet as ---with total as ---(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from dataset_concepts pc ---join context c on pc.concept like concat('%',c.id,'%') ---join dataset p on p.id=pc.id ---where cast(year as int)>=2003 and cast(year as int)<=2021 ---group by c.name, year ) ---select year, name, round(no_of_pubs/total*100,3) averageofdataset ---from total; - ---create table indi_software_avg_year_context_oa stored as parquet as ---with total as ---(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from software_concepts pc ---join context c on pc.concept like concat('%',c.id,'%') ---join software p on p.id=pc.id ---where cast(year as int)>=2003 and cast(year as int)<=2021 ---group by c.name, year ) ---select year, name, round(no_of_pubs/total*100,3) averageofsoftware ---from total; - ---create table indi_other_avg_year_context_oa stored as parquet as ---with total as ---(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from otherresearchproduct_concepts pc ---join context c on pc.concept like concat('%',c.id,'%') ---join otherresearchproduct p on p.id=pc.id ---where cast(year as int)>=2003 and cast(year as int)<=2021 ---group by c.name, year ) ---select year, name, round(no_of_pubs/total*100,3) averageofother ---from total; - ---create table indi_other_avg_year_content_oa stored as parquet as ---with total as ---(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total ---from otherresearchproduct_datasources pd ---join datasource d on datasource=d.id ---join otherresearchproduct p on p.id=pd.id ---where cast(year as int)>=2003 and cast(year as int)<=2021 ---group by d.type, year) ---select year, type, round(no_of_pubs/total*100,3) averageOfOtherresearchproduct ---from total; - ---create table indi_software_avg_year_content_oa stored as parquet as ---with total as ---(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total ---from software_datasources pd ---join datasource d on datasource=d.id ---join software p on p.id=pd.id ---where cast(year as int)>=2003 and cast(year as int)<=2021 ---group by d.type, year) ---select year, type, round(no_of_pubs/total*100,3) averageOfSoftware ---from total; - ---create table indi_dataset_avg_year_content_oa stored as parquet as ---with total as ---(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total ---from dataset_datasources pd ---join datasource d on datasource=d.id ---join dataset p on p.id=pd.id ---where cast(year as int)>=2003 and cast(year as int)<=2021 ---group by d.type, year) ---select year, type, 
round(no_of_pubs/total*100,3) averageOfDatasets ---from total; - ---create table indi_pub_avg_year_content_oa stored as parquet as ---with total as ---(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total ---from publication_datasources pd ---join datasource d on datasource=d.id ---join publication p on p.id=pd.id ---where cast(year as int)>=2003 and cast(year as int)<=2021 ---group by d.type, year) ---select year, type, round(no_of_pubs/total*100,3) averageOfPubs ---from total; - -create table indi_pub_has_cc_licence stored as parquet as -select distinct p.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license -from publication p -left outer join (select p.id, license.type as lic from publication p -join publication_licenses as license on license.id = p.id +---- Sprint 2 ---- +create table indi_result_has_cc_licence stored as parquet as +select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license +from result r +left outer join (select r.id, license.type as lic from result r +join result_licenses as license on license.id = r.id where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp -on p.id= tmp.id; +on r.id= tmp.id; -create table indi_pub_has_cc_licence_url stored as parquet as -select distinct p.id, (case when lic_host='' or lic_host is null then 0 else 1 end) as has_cc_license_url -from publication p -left outer join (select p.id, lower(parse_url(license.type, "HOST")) as lic_host -from publication p -join publication_licenses as license on license.id = p.id -WHERE lower(parse_url(license.type, 'HOST')) = 'creativecommons.org') tmp -on p.id= tmp.id; - --- EOSC-TR1.1-02M: --- ## Indicator: has_cc_license. Creative Commons licensing has become a --- de facto standard in scholarly communication and is promoted by many initiatives --- like Plan S. This indicator might be only useful when applied --- to openly available publications. ---create table indi_pub_has_cc_licence_tr stored as parquet as ---select distinct p.id, case when lic='' or lic is null then 0 else 1 end as has_cc_license_tr ---from publication p ---left outer join (select p.id, license.type as lic from publication p ---join publication_licenses as license on license.id = p.id ---where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp ---on p.id= tmp.id - --- #EOSC-F2-01M_cc Rich metadata for scholarly publications --- ## Indicator: has_cc_license. Creative Commons licensing has become a --- de facto standard in scholarly communication and is promoted by many initiatives --- like Plan S. This indicator might be only useful when applied --- to openly available publications. 
- --- Same indicator as EOSC-TR1.1-02M (Najko's instructions) --- create table indi_pub_has_cc_licence_f stored as parquet as --- select --- distinct p.id, case when lic='' or lic is null then 0 else 1 end as has_cc_license_f --- from publication p --- left outer join (selectp.id,license.type as lic from publication p --- join publication_licenses as license on license.id = p.id --- where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp --- on p.id= tmp.id +create table indi_result_has_cc_licence_url stored as parquet as +select distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url +from result r +left outer join (select r.id, lower(parse_url(license.type, "HOST")) as lic_host +from result r +join result_licenses as license on license.id = r.id +WHERE lower(parse_url(license.type, "HOST")) = "creativecommons.org") tmp +on r.id= tmp.id; create table indi_pub_has_abstract stored as parquet as select distinct publication.id, coalesce(abstract, 1) has_abstract from publication; -create table indi_result_with_orcid stored as parquet as +create table indi_result_with_orcid stored as parquet as select distinct r.id, coalesce(has_orcid, 0) as has_orcid -from result r -left outer join (select id, 1 as has_orcid from result_orcid) tmp -on r.id= tmp.id +from result r +left outer join (select id, 1 as has_orcid from result_orcid) tmp +on r.id= tmp.id; -create table indi_funded_result_with_fundref stored as parquet as + +---- Sprint 3 ---- +create table indi_funded_result_with_fundref stored as parquet as select distinct r.id, coalesce(fundref, 0) as fundref -from project_results r +from project_results r left outer join (select distinct id, 1 as fundref from project_results -where provenance='Harvested') tmp -on r.id= tmp.id +where provenance='Harvested') tmp +on r.id= tmp.id; -create table indi_result_org_country_collab stored as parquet as -with tmp as +create table indi_result_org_country_collab stored as parquet as +with tmp as (select o.id as id, o.country , ro.id as result,r.type from organization o join result_organization ro on o.id=ro.organization join result r on r.id=ro.id where o.country <> 'UNKNOWN') select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations from tmp as o1 join tmp as o2 on o1.result=o2.result -where o1.id<>o2.id and o1.country<>o2.country -group by o1.id, o1.type,o2.country +where o1.id<>o2.id and o1.country<>o2.country +group by o1.id, o1.type,o2.country; -create table indi_result_org_collab stored as parquet as -with tmp as +create table indi_result_org_collab stored as parquet as +with tmp as (select o.id, ro.id as result,r.type from organization o join result_organization ro on o.id=ro.organization join result r on r.id=ro.id) @@ -297,66 +99,82 @@ select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaboratio from tmp as o1 join tmp as o2 on o1.result=o2.result where o1.id<>o2.id -group by o1.id, o2.id, o1.type +group by o1.id, o2.id, o1.type; -create table indi_funder_country_collab stored as parquet as -with tmp as (select funder, project, country from organization_projects op -join organization o on o.id=op.id -join project p on p.id=op.project +create table indi_funder_country_collab stored as parquet as +with tmp as (select funder, project, country from organization_projects op +join organization o on o.id=op.id +join project p on p.id=op.project where country <> 'UNKNOWN') -select f1.funder, f1.country as country1, f2.country as 
country2, count(distinct f1.project) as collaborations +select f1.funder, f1.country, f2.country, count(distinct f1.project) as collaborations from tmp as f1 join tmp as f2 on f1.project=f2.project -where f1.country<>f2.country -group by f1.funder, f2.country, f1.country +where f1.country<>f2.country +group by f1.funder, f2.country, f1.country; +create table indi_result_country_collab stored as parquet as +with tmp as +(select country, ro.id as result,r.type from organization o +join result_organization ro on o.id=ro.organization +join result r on r.id=ro.id) +select o1.country country1, o2.country country2, o1.type, count(distinct o1.result) as collaborations +from tmp as o1 +join tmp as o2 on o1.result=o2.result +where o1.country<>o2.country +group by o1.country, o2.country, o1.type; + +---- Sprint 4 ---- create table indi_pub_diamond stored as parquet as -select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal +select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal from publication_datasources pd left outer join ( select pd.id, 1 as in_diamond_journal from publication_datasources pd join datasource d on d.id=pd.datasource join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp -on pd.id=tmp.id +on pd.id=tmp.id; create table indi_pub_hybrid stored as parquet as -select distinct pd.id, coalesce(is_hybrid, 0) as is_hybrid +select distinct pd.id, coalesce(is_hybrid, 0) as is_hybrid from publication_datasources pd left outer join ( select pd.id, 1 as is_hybrid from publication_datasources pd join datasource d on d.id=pd.datasource join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) and (ps.journal_is_in_doaj=false and ps.journal_is_oa=false)) tmp -on pd.id=tmp.id +on pd.id=tmp.id; create table indi_is_gold_oa stored as parquet as -(select distinct pd.id, coalesce(gold_oa, 0) as gold_oa +(select distinct pd.id, coalesce(gold_oa, 0) as gold_oa from publication_datasources pd left outer join ( select pd.id, 1 as gold_oa from publication_datasources pd join datasource d on d.id=pd.datasource join stats_ext.plan_s_jn ps on (ps.issn_print=d.issn_printed or ps.issn_online=d.issn_online) where ps.journal_is_in_doaj is true or ps.journal_is_oa is true) tmp -on pd.id=tmp.id) - +on pd.id=tmp.id); create table indi_pub_in_transformative stored as parquet as -select distinct pd.id, coalesce(is_transformative, 0) as is_transformative +select distinct pd.id, coalesce(is_transformative, 0) as is_transformative from publication pd left outer join ( select pd.id, 1 as is_transformative from publication_datasources pd join datasource d on d.id=pd.datasource join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) and ps.is_transformative_journal=true) tmp -on pd.id=tmp.id +on pd.id=tmp.id; create table indi_pub_closed_other_open stored as parquet as select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open from result_instance ri -left outer join +left outer join (select ri.id, 1 as pub_closed_other_open from result_instance ri join publication p on p.id=ri.id join datasource d on ri.hostedby=d.id where d.type like '%Journal%' and ri.accessright='Closed Access' and -(p.bestlicence='Open Access' or p.bestlicence='Open Source')) tmp -on tmp.id=ri.id \ No newline at end of file +(p.bestlicence='Open Access' or p.bestlicence='Open 
Source')) tmp +on tmp.id=ri.id; + + +---- Sprint 5 ---- +create table indi_result_no_of_copies stored as parquet as +select id, count(id) as number_of_copies from result_instance group by id; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 2d6ff41d4..9bd90aa1f 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -52,7 +52,8 @@ compute stats TARGET.result_languages; create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.result_licenses; -create table TARGET.licenses_normalized as select * from SOURCE.licenses_normalized; +create table TARGET.licenses_normalized stored as parquet as select * from SOURCE.licenses_normalized orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.licenses_normalized; create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.result_oids; @@ -141,9 +142,6 @@ compute stats TARGET.indi_pub_closed_other_open; create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.indi_result_no_of_copies; ---- Usage statistics -create table TARGET.usage_stats stored as parquet as select * from SOURCE.usage_stats orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); - --denorm alter table TARGET.result rename to TARGET.res_tmp; From b6b4bc0df975508763e86d7cc868d5d81f6db176 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Tue, 21 Dec 2021 03:35:04 +0200 Subject: [PATCH 40/51] added first indicator of sprint 5 --- .../oozie_app/scripts/step20-createMonitorDB.sql | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 9bd90aa1f..1ae459640 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -49,11 +49,8 @@ compute stats TARGET.result_greenoa; create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.result_languages; -create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_licenses; - -create table TARGET.licenses_normalized stored as parquet as select * from SOURCE.licenses_normalized orig where exists (select 1 from 
TARGET.result r where r.id=orig.id); -compute stats TARGET.licenses_normalized; +create table TARGET.result_licences stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.result_licences; create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.result_oids; @@ -103,7 +100,6 @@ create view if not exists TARGET.project as select * from SOURCE.project; create view if not exists TARGET.project_oids as select * from SOURCE.project_oids; create view if not exists TARGET.project_organizations as select * from SOURCE.project_organizations; create view if not exists TARGET.project_resultcount as select * from SOURCE.project_resultcount; -create view if not exists TARGET.project_classification as select * from SOURCE.project_classification; create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects; compute stats TARGET.project_results; @@ -119,8 +115,8 @@ create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.i compute stats TARGET.indi_pub_gold_oa; create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.indi_pub_has_abstract; -create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_result_has_cc_licence; +create table TARGET.indi_result_has_cc_licence_f stored as parquet as select * from SOURCE.indi_result_has_cc_licence_f orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.indi_result_has_cc_licence_f; create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.indi_result_has_cc_licence_url; From b053b0178e0624c3416cb38eaa82014b99e64be2 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Mon, 20 Dec 2021 19:23:57 +0200 Subject: [PATCH 41/51] Sprint 5 and other changes --- .../scripts/step16-createIndicatorsTables.sql | 234 ++++++++++++++++-- 1 file changed, 219 insertions(+), 15 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index bdd681f8d..9c67da883 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -42,8 +42,181 @@ join datasource on datasource.id = ri.hostedby where datasource.id like '%doajarticles%') tmp on p.id= tmp.id; +--create table indi_project_pubs_count stored as parquet as +--select pr.id id, count(p.id) total_pubs from project_results pr +--join publication p on p.id=pr.result +--group by pr.id; + +--create table indi_project_datasets_count stored as parquet as +--select pr.id id, count(d.id) total_datasets from project_results pr 
+--join dataset d on d.id=pr.result +--group by pr.id; + +--create table indi_project_software_count stored as parquet as +--select pr.id id, count(s.id) total_software from project_results pr +--join software s on s.id=pr.result +--group by pr.id; + +--create table indi_project_otherresearch_count stored as parquet as +--select pr.id id, count(o.id) total_other from project_results pr +--join otherresearchproduct o on o.id=pr.result +--group by pr.id; + +--create table indi_pub_avg_year_country_oa stored as parquet as +--select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA, +--round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA + --from + --(SELECT year, country, SUM(CASE + --WHEN bestlicence='Open Access' THEN 1 + --ELSE 0 + --END) AS OpenAccess, SUM(CASE + --WHEN bestlicence<>'Open Access' THEN 1 + --ELSE 0 + --END) AS NonOpenAccess + --FROM publication p + --join result_organization ro on p.id=ro.id + --join organization o on o.id=ro.organization + --where cast(year as int)>=2003 and cast(year as int)<=2021 + --group by year, country) tmp; + +--create table indi_dataset_avg_year_country_oa stored as parquet as +--select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA, +--round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA +--from + --(SELECT year, country, SUM(CASE + --WHEN bestlicence='Open Access' THEN 1 + --ELSE 0 + --END) AS OpenAccess, SUM(CASE + --WHEN bestlicence<>'Open Access' THEN 1 + --ELSE 0 + --END) AS NonOpenAccess + --FROM dataset d + --join result_organization ro on d.id=ro.id + --join organization o on o.id=ro.organization + --where cast(year as int)>=2003 and cast(year as int)<=2021 + --group by year, country) tmp; + +--create table indi_software_avg_year_country_oa stored as parquet as +--select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA, +--round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA +--from +-- (SELECT year, country, SUM(CASE + --WHEN bestlicence='Open Access' THEN 1 +-- ELSE 0 +--END) AS OpenAccess, SUM(CASE +-- WHEN bestlicence<>'Open Access' THEN 1 +-- ELSE 0 +-- END) AS NonOpenAccess +-- FROM software s +-- join result_organization ro on s.id=ro.id +-- join organization o on o.id=ro.organization +-- where cast(year as int)>=2003 and cast(year as int)<=2021 +-- group by year, country) tmp; + + +--create table indi_other_avg_year_country_oa stored as parquet as +--select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA, +--round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA +-- from +-- (SELECT year, country, SUM(CASE +-- WHEN bestlicence='Open Access' THEN 1 +-- ELSE 0 +-- END) AS OpenAccess, SUM(CASE +-- WHEN bestlicence<>'Open Access' THEN 1 +-- ELSE 0 +-- END) AS NonOpenAccess +-- FROM otherresearchproduct orp +-- join result_organization ro on orp.id=ro.id +-- join organization o on o.id=ro.organization +-- where cast(year as int)>=2003 and cast(year as int)<=2021 +-- group by year, country) tmp; + +--create table indi_pub_avg_year_context_oa stored as parquet as +--with total as +--(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from publication_concepts pc +--join context c on pc.concept like concat('%',c.id,'%') +--join publication p on p.id=pc.id +--where cast(year as int)>=2003 and cast(year as int)<=2021 +--group by c.name, year ) +--select year, name, 
round(no_of_pubs/total*100,3) averageofpubs +--from total; + +--create table indi_dataset_avg_year_context_oa stored as parquet as +--with total as +--(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from dataset_concepts pc +--join context c on pc.concept like concat('%',c.id,'%') +--join dataset p on p.id=pc.id +--where cast(year as int)>=2003 and cast(year as int)<=2021 +--group by c.name, year ) +--select year, name, round(no_of_pubs/total*100,3) averageofdataset +--from total; + +--create table indi_software_avg_year_context_oa stored as parquet as +--with total as +--(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from software_concepts pc +--join context c on pc.concept like concat('%',c.id,'%') +--join software p on p.id=pc.id +--where cast(year as int)>=2003 and cast(year as int)<=2021 +--group by c.name, year ) +--select year, name, round(no_of_pubs/total*100,3) averageofsoftware +--from total; + +--create table indi_other_avg_year_context_oa stored as parquet as +--with total as +--(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from otherresearchproduct_concepts pc +--join context c on pc.concept like concat('%',c.id,'%') +--join otherresearchproduct p on p.id=pc.id +--where cast(year as int)>=2003 and cast(year as int)<=2021 +--group by c.name, year ) +--select year, name, round(no_of_pubs/total*100,3) averageofother +--from total; + +--create table indi_other_avg_year_content_oa stored as parquet as +--with total as +--(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total +--from otherresearchproduct_datasources pd +--join datasource d on datasource=d.id +--join otherresearchproduct p on p.id=pd.id +--where cast(year as int)>=2003 and cast(year as int)<=2021 +--group by d.type, year) +--select year, type, round(no_of_pubs/total*100,3) averageOfOtherresearchproduct +--from total; + +--create table indi_software_avg_year_content_oa stored as parquet as +--with total as +--(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total +--from software_datasources pd +--join datasource d on datasource=d.id +--join software p on p.id=pd.id +--where cast(year as int)>=2003 and cast(year as int)<=2021 +--group by d.type, year) +--select year, type, round(no_of_pubs/total*100,3) averageOfSoftware +--from total; + +--create table indi_dataset_avg_year_content_oa stored as parquet as +--with total as +--(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total +--from dataset_datasources pd +--join datasource d on datasource=d.id +--join dataset p on p.id=pd.id +--where cast(year as int)>=2003 and cast(year as int)<=2021 +--group by d.type, year) +--select year, type, round(no_of_pubs/total*100,3) averageOfDatasets +--from total; + +--create table indi_pub_avg_year_content_oa stored as parquet as +--with total as +--(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total +--from publication_datasources pd +--join datasource d on datasource=d.id +--join publication p on p.id=pd.id +--where cast(year as int)>=2003 and cast(year as int)<=2021 +--group by d.type, year) +--select year, type, round(no_of_pubs/total*100,3) 
averageOfPubs +--from total; + ---- Sprint 2 ---- -create table indi_result_has_cc_licence stored as parquet as +create table indi_result_has_cc_licence_f stored as parquet as select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license from result r left outer join (select r.id, license.type as lic from result r @@ -60,6 +233,35 @@ join result_licenses as license on license.id = r.id WHERE lower(parse_url(license.type, "HOST")) = "creativecommons.org") tmp on r.id= tmp.id; +-- EOSC-TR1.1-02M: +-- ## Indicator: has_cc_license. Creative Commons licensing has become a +-- de facto standard in scholarly communication and is promoted by many initiatives +-- like Plan S. This indicator might be only useful when applied +-- to openly available publications. +--create table indi_pub_has_cc_licence_tr stored as parquet as +--select distinct p.id, case when lic='' or lic is null then 0 else 1 end indi_result_org_collabas has_cc_license_tr +--from publication p +--left outer join (select p.id, license.type as lic from publication p +--join publication_licenses as license on license.id = p.id +--where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp +--on p.id= tmp.id + +-- #EOSC-F2-01M_cc Rich metadata for scholarly publications +-- ## Indicator: has_cc_license. Creative Commons licensing has become a +-- de facto standard in scholarly communication and is promoted by many initiatives +-- like Plan S. This indicator might be only useful when applied +-- to openly available publications. + +-- Same indicator as EOSC-TR1.1-02M (Najko's instructions) +-- create table indi_pub_has_cc_licence_f stored as parquet as +-- select +-- distinct p.id, case when lic='' or lic is null then 0 else 1 end as has_cc_license_f +-- from publication p +-- left outer join (selectp.id,license.type as lic from publication p +-- join publication_licenses as license on license.id = p.id +-- where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp +-- on p.id= tmp.id + create table indi_pub_has_abstract stored as parquet as select distinct publication.id, coalesce(abstract, 1) has_abstract from publication; @@ -68,16 +270,16 @@ create table indi_result_with_orcid stored as parquet as select distinct r.id, coalesce(has_orcid, 0) as has_orcid from result r left outer join (select id, 1 as has_orcid from result_orcid) tmp -on r.id= tmp.id; - +on r.id= tmp.id ---- Sprint 3 ---- + create table indi_funded_result_with_fundref stored as parquet as select distinct r.id, coalesce(fundref, 0) as fundref from project_results r left outer join (select distinct id, 1 as fundref from project_results where provenance='Harvested') tmp -on r.id= tmp.id; +on r.id= tmp.id create table indi_result_org_country_collab stored as parquet as with tmp as @@ -88,7 +290,7 @@ select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as col from tmp as o1 join tmp as o2 on o1.result=o2.result where o1.id<>o2.id and o1.country<>o2.country -group by o1.id, o1.type,o2.country; +group by o1.id, o1.type,o2.country create table indi_result_org_collab stored as parquet as with tmp as @@ -99,7 +301,7 @@ select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaboratio from tmp as o1 join tmp as o2 on o1.result=o2.result where o1.id<>o2.id -group by o1.id, o2.id, o1.type; +group by o1.id, o2.id, o1.type create table indi_funder_country_collab stored as parquet as with tmp as (select funder, project, country from 
organization_projects op @@ -110,7 +312,7 @@ select f1.funder, f1.country, f2.country, count(distinct f1.project) as collabor from tmp as f1 join tmp as f2 on f1.project=f2.project where f1.country<>f2.country -group by f1.funder, f2.country, f1.country; +group by f1.funder, f2.country, f1.country create table indi_result_country_collab stored as parquet as with tmp as @@ -121,9 +323,10 @@ select o1.country country1, o2.country country2, o1.type, count(distinct o1.resu from tmp as o1 join tmp as o2 on o1.result=o2.result where o1.country<>o2.country -group by o1.country, o2.country, o1.type; +group by o1.country, o2.country, o1.type ---- Sprint 4 ---- + create table indi_pub_diamond stored as parquet as select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal from publication_datasources pd @@ -132,7 +335,7 @@ select pd.id, 1 as in_diamond_journal from publication_datasources pd join datasource d on d.id=pd.datasource join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp -on pd.id=tmp.id; +on pd.id=tmp.id create table indi_pub_hybrid stored as parquet as select distinct pd.id, coalesce(is_hybrid, 0) as is_hybrid @@ -142,7 +345,7 @@ select pd.id, 1 as is_hybrid from publication_datasources pd join datasource d on d.id=pd.datasource join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) and (ps.journal_is_in_doaj=false and ps.journal_is_oa=false)) tmp -on pd.id=tmp.id; +on pd.id=tmp.id create table indi_is_gold_oa stored as parquet as (select distinct pd.id, coalesce(gold_oa, 0) as gold_oa @@ -152,7 +355,8 @@ select pd.id, 1 as gold_oa from publication_datasources pd join datasource d on d.id=pd.datasource join stats_ext.plan_s_jn ps on (ps.issn_print=d.issn_printed or ps.issn_online=d.issn_online) where ps.journal_is_in_doaj is true or ps.journal_is_oa is true) tmp -on pd.id=tmp.id); +on pd.id=tmp.id) + create table indi_pub_in_transformative stored as parquet as select distinct pd.id, coalesce(is_transformative, 0) as is_transformative @@ -162,7 +366,7 @@ select pd.id, 1 as is_transformative from publication_datasources pd join datasource d on d.id=pd.datasource join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) and ps.is_transformative_journal=true) tmp -on pd.id=tmp.id; +on pd.id=tmp.id create table indi_pub_closed_other_open stored as parquet as select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open from result_instance ri @@ -172,9 +376,9 @@ join publication p on p.id=ri.id join datasource d on ri.hostedby=d.id where d.type like '%Journal%' and ri.accessright='Closed Access' and (p.bestlicence='Open Access' or p.bestlicence='Open Source')) tmp -on tmp.id=ri.id; - +on tmp.id=ri.id ---- Sprint 5 ---- + create table indi_result_no_of_copies stored as parquet as -select id, count(id) as number_of_copies from result_instance group by id; \ No newline at end of file +select id, count(id) as number_of_copies from result_instance group by id \ No newline at end of file From 9acc32faa69ae1d6dae8bc2f9a0e941d130a5fa3 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 12 Jan 2022 12:04:31 +0100 Subject: [PATCH 42/51] [stats wf] final touches for the integration of PRs #166, #179 in the master branch --- .../scripts/step16-createIndicatorsTables.sql | 236 ++---------------- .../scripts/step20-createMonitorDB.sql | 17 +- 2 files 
changed, 29 insertions(+), 224 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index 9c67da883..9f11fa49d 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -42,181 +42,8 @@ join datasource on datasource.id = ri.hostedby where datasource.id like '%doajarticles%') tmp on p.id= tmp.id; ---create table indi_project_pubs_count stored as parquet as ---select pr.id id, count(p.id) total_pubs from project_results pr ---join publication p on p.id=pr.result ---group by pr.id; - ---create table indi_project_datasets_count stored as parquet as ---select pr.id id, count(d.id) total_datasets from project_results pr ---join dataset d on d.id=pr.result ---group by pr.id; - ---create table indi_project_software_count stored as parquet as ---select pr.id id, count(s.id) total_software from project_results pr ---join software s on s.id=pr.result ---group by pr.id; - ---create table indi_project_otherresearch_count stored as parquet as ---select pr.id id, count(o.id) total_other from project_results pr ---join otherresearchproduct o on o.id=pr.result ---group by pr.id; - ---create table indi_pub_avg_year_country_oa stored as parquet as ---select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA, ---round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA - --from - --(SELECT year, country, SUM(CASE - --WHEN bestlicence='Open Access' THEN 1 - --ELSE 0 - --END) AS OpenAccess, SUM(CASE - --WHEN bestlicence<>'Open Access' THEN 1 - --ELSE 0 - --END) AS NonOpenAccess - --FROM publication p - --join result_organization ro on p.id=ro.id - --join organization o on o.id=ro.organization - --where cast(year as int)>=2003 and cast(year as int)<=2021 - --group by year, country) tmp; - ---create table indi_dataset_avg_year_country_oa stored as parquet as ---select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA, ---round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA ---from - --(SELECT year, country, SUM(CASE - --WHEN bestlicence='Open Access' THEN 1 - --ELSE 0 - --END) AS OpenAccess, SUM(CASE - --WHEN bestlicence<>'Open Access' THEN 1 - --ELSE 0 - --END) AS NonOpenAccess - --FROM dataset d - --join result_organization ro on d.id=ro.id - --join organization o on o.id=ro.organization - --where cast(year as int)>=2003 and cast(year as int)<=2021 - --group by year, country) tmp; - ---create table indi_software_avg_year_country_oa stored as parquet as ---select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA, ---round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA ---from --- (SELECT year, country, SUM(CASE - --WHEN bestlicence='Open Access' THEN 1 --- ELSE 0 ---END) AS OpenAccess, SUM(CASE --- WHEN bestlicence<>'Open Access' THEN 1 --- ELSE 0 --- END) AS NonOpenAccess --- FROM software s --- join result_organization ro on s.id=ro.id --- join organization o on o.id=ro.organization --- where cast(year as int)>=2003 and cast(year as int)<=2021 --- group by year, country) tmp; - - ---create table 
indi_other_avg_year_country_oa stored as parquet as
---select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA,
---round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA
--- from
--- (SELECT year, country, SUM(CASE
--- WHEN bestlicence='Open Access' THEN 1
--- ELSE 0
--- END) AS OpenAccess, SUM(CASE
--- WHEN bestlicence<>'Open Access' THEN 1
--- ELSE 0
--- END) AS NonOpenAccess
--- FROM otherresearchproduct orp
--- join result_organization ro on orp.id=ro.id
--- join organization o on o.id=ro.organization
--- where cast(year as int)>=2003 and cast(year as int)<=2021
--- group by year, country) tmp;
-
---create table indi_pub_avg_year_context_oa stored as parquet as
---with total as
---(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from publication_concepts pc
---join context c on pc.concept like concat('%',c.id,'%')
---join publication p on p.id=pc.id
---where cast(year as int)>=2003 and cast(year as int)<=2021
---group by c.name, year )
---select year, name, round(no_of_pubs/total*100,3) averageofpubs
---from total;
-
---create table indi_dataset_avg_year_context_oa stored as parquet as
---with total as
---(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from dataset_concepts pc
---join context c on pc.concept like concat('%',c.id,'%')
---join dataset p on p.id=pc.id
---where cast(year as int)>=2003 and cast(year as int)<=2021
---group by c.name, year )
---select year, name, round(no_of_pubs/total*100,3) averageofdataset
---from total;
-
---create table indi_software_avg_year_context_oa stored as parquet as
---with total as
---(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from software_concepts pc
---join context c on pc.concept like concat('%',c.id,'%')
---join software p on p.id=pc.id
---where cast(year as int)>=2003 and cast(year as int)<=2021
---group by c.name, year )
---select year, name, round(no_of_pubs/total*100,3) averageofsoftware
---from total;
-
---create table indi_other_avg_year_context_oa stored as parquet as
---with total as
---(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from otherresearchproduct_concepts pc
---join context c on pc.concept like concat('%',c.id,'%')
---join otherresearchproduct p on p.id=pc.id
---where cast(year as int)>=2003 and cast(year as int)<=2021
---group by c.name, year )
---select year, name, round(no_of_pubs/total*100,3) averageofother
---from total;
-
---create table indi_other_avg_year_content_oa stored as parquet as
---with total as
---(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
---from otherresearchproduct_datasources pd
---join datasource d on datasource=d.id
---join otherresearchproduct p on p.id=pd.id
---where cast(year as int)>=2003 and cast(year as int)<=2021
---group by d.type, year)
---select year, type, round(no_of_pubs/total*100,3) averageOfOtherresearchproduct
---from total;
-
---create table indi_software_avg_year_content_oa stored as parquet as
---with total as
---(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
---from software_datasources pd
---join datasource d on datasource=d.id
---join software p on p.id=pd.id
---where cast(year as int)>=2003 and cast(year as int)<=2021
---group by d.type, year)
---select year, type, round(no_of_pubs/total*100,3) averageOfSoftware
---from total;
-
---create table indi_dataset_avg_year_content_oa stored as parquet as
---with total as
---(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
---from dataset_datasources pd
---join datasource d on datasource=d.id
---join dataset p on p.id=pd.id
---where cast(year as int)>=2003 and cast(year as int)<=2021
---group by d.type, year)
---select year, type, round(no_of_pubs/total*100,3) averageOfDatasets
---from total;
-
---create table indi_pub_avg_year_content_oa stored as parquet as
---with total as
---(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
---from publication_datasources pd
---join datasource d on datasource=d.id
---join publication p on p.id=pd.id
---where cast(year as int)>=2003 and cast(year as int)<=2021
---group by d.type, year)
---select year, type, round(no_of_pubs/total*100,3) averageOfPubs
---from total;
-
---- Sprint 2 ----
-create table indi_result_has_cc_licence_f stored as parquet as
+create table indi_result_has_cc_licence stored as parquet as
select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license
from result r
left outer join (select r.id, license.type as lic from result r
@@ -233,35 +60,6 @@ join result_licenses as license on license.id = r.id
WHERE lower(parse_url(license.type, "HOST")) = "creativecommons.org") tmp
on r.id= tmp.id;
--- EOSC-TR1.1-02M:
--- ## Indicator: has_cc_license. Creative Commons licensing has become a
--- de facto standard in scholarly communication and is promoted by many initiatives
--- like Plan S. This indicator might be only useful when applied
--- to openly available publications.
---create table indi_pub_has_cc_licence_tr stored as parquet as
---select distinct p.id, case when lic='' or lic is null then 0 else 1 end indi_result_org_collabas has_cc_license_tr
---from publication p
---left outer join (select p.id, license.type as lic from publication p
---join publication_licenses as license on license.id = p.id
---where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp
---on p.id= tmp.id
-
--- #EOSC-F2-01M_cc Rich metadata for scholarly publications
--- ## Indicator: has_cc_license. Creative Commons licensing has become a
--- de facto standard in scholarly communication and is promoted by many initiatives
--- like Plan S. This indicator might be only useful when applied
--- to openly available publications.
-
--- Same indicator as EOSC-TR1.1-02M (Najko's instructions)
--- create table indi_pub_has_cc_licence_f stored as parquet as
--- select
--- distinct p.id, case when lic='' or lic is null then 0 else 1 end as has_cc_license_f
--- from publication p
--- left outer join (selectp.id,license.type as lic from publication p
--- join publication_licenses as license on license.id = p.id
--- where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp
--- on p.id= tmp.id
-
create table indi_pub_has_abstract stored as parquet as
select distinct publication.id, coalesce(abstract, 1) has_abstract from publication;
@@ -270,16 +68,16 @@ create table indi_result_with_orcid stored as parquet as
select distinct r.id, coalesce(has_orcid, 0) as has_orcid
from result r
left outer join (select id, 1 as has_orcid from result_orcid) tmp
-on r.id= tmp.id
+on r.id= tmp.id;
+
---- Sprint 3 ----
-
create table indi_funded_result_with_fundref stored as parquet as
select distinct r.id, coalesce(fundref, 0) as fundref
from project_results r
left outer join (select distinct id, 1 as fundref from project_results
where provenance='Harvested') tmp
-on r.id= tmp.id
+on r.id= tmp.id;

create table indi_result_org_country_collab stored as parquet as
with tmp as
(select o.id as id, o.country , ro.id as result,r.type from organization o
join result_organization ro on o.id=ro.id
join result r on r.id=ro.id where o.country <> 'UNKNOWN')
select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations
from tmp as o1
join tmp as o2 on o1.result=o2.result
where o1.id<>o2.id and o1.country<>o2.country
-group by o1.id, o1.type,o2.country
+group by o1.id, o1.type,o2.country;

create table indi_result_org_collab stored as parquet as
with tmp as
(select o.id, ro.id as result,r.type from organization o
join result_organization ro on o.id=ro.id
join result r on r.id=ro.id)
select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaborations
from tmp as o1
join tmp as o2 on o1.result=o2.result
where o1.id<>o2.id
-group by o1.id, o2.id, o1.type
+group by o1.id, o2.id, o1.type;

create table indi_funder_country_collab stored as parquet as
with tmp as (select funder, project, country from organization_projects op
join organization o on o.id=op.id
join project p on p.id=op.project
where country <> 'UNKNOWN')
select f1.funder, f1.country as country1, f2.country as country2, count(distinct f1.project) as collaborations
from tmp as f1
join tmp as f2 on f1.project=f2.project
where f1.country<>f2.country
-group by f1.funder, f2.country, f1.country
+group by f1.funder, f2.country, f1.country;

create table indi_result_country_collab stored as parquet as
with tmp as
(select o.country , ro.id as result,r.type from organization o
join result_organization ro on o.id=ro.id
join result r on r.id=ro.id where country <> 'UNKNOWN')
select o1.country country1, o2.country country2, o1.type, count(distinct o1.result) as collaborations
from tmp as o1
join tmp as o2 on o1.result=o2.result
where o1.country<>o2.country
-group by o1.country, o2.country, o1.type
+group by o1.country, o2.country, o1.type;

---- Sprint 4 ----
-
create table indi_pub_diamond stored as parquet as
select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal
from publication_datasources pd
left outer join (
select pd.id, 1 as in_diamond_journal from publication_datasources pd
join datasource d on d.id=pd.datasource
join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online)
and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp
-on pd.id=tmp.id
+on pd.id=tmp.id;

create table indi_pub_hybrid stored as parquet as
select distinct pd.id, coalesce(is_hybrid, 0) as is_hybrid
from publication_datasources pd
left outer join (
select pd.id, 1 as is_hybrid from publication_datasources pd
join datasource d on d.id=pd.datasource
join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online)
and (ps.journal_is_in_doaj=false and ps.journal_is_oa=false)) tmp
-on pd.id=tmp.id
+on pd.id=tmp.id;

create table indi_is_gold_oa stored as parquet as
(select distinct pd.id, coalesce(gold_oa, 0) as gold_oa
from publication_datasources pd
left outer join (
select pd.id, 1 as gold_oa from publication_datasources pd
join datasource d on d.id=pd.datasource
join stats_ext.plan_s_jn ps on (ps.issn_print=d.issn_printed or ps.issn_online=d.issn_online)
where ps.journal_is_in_doaj is true or ps.journal_is_oa is true) tmp
-on pd.id=tmp.id)
-
create table indi_pub_in_transformative stored as parquet as
select distinct pd.id, coalesce(is_transformative, 0) as is_transformative
from publication pd
left outer join (
select pd.id, 1 as is_transformative from publication_datasources pd
join datasource d on d.id=pd.datasource
join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online)
and ps.is_transformative_journal=true) tmp
-on pd.id=tmp.id
+on pd.id=tmp.id;

create table indi_pub_closed_other_open stored as parquet as
select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open from result_instance ri
left outer join
(select ri.id, 1 as pub_closed_other_open from result_instance ri
join publication p on p.id=ri.id
join datasource d on ri.hostedby=d.id
where d.type like '%Journal%' and ri.accessright='Closed Access' and (p.bestlicence='Open Access' or p.bestlicence='Open Source')) tmp
-on tmp.id=ri.id
+on tmp.id=ri.id;
+
---- Sprint 5 ----
-
create table indi_result_no_of_copies stored as parquet as
-select id, count(id) as number_of_copies from result_instance group by id
\ No newline at end of file
+select id, count(id) as number_of_copies from result_instance group by id;
\ No newline at end of file

diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql
index 1ae459640..e72378f56 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql
@@ -49,8 +49,10 @@ compute stats TARGET.result_greenoa;
create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_languages;
-create table TARGET.result_licences stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id);
-compute stats TARGET.result_licences;
+create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+compute stats TARGET.result_licenses;
+
+create table TARGET.licenses_normalized as select * from SOURCE.licenses_normalized;

create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_oids;
@@ -79,6 +81,9 @@ compute stats TARGET.result_sources;
create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_topics;
+create table TARGET.result_result stored as parquet as select * from SOURCE.result_result orig where exists (select 1 from TARGET.result r where r.id=orig.source or r.id=orig.target);
+compute stats TARGET.result_result;
+
-- datasources
create view if not exists TARGET.datasource as select * from SOURCE.datasource;
create view if not exists TARGET.datasource_oids as select * from SOURCE.datasource_oids;
@@ -100,6 +105,7 @@ create view if not exists TARGET.project as select * from SOURCE.project;
create view if not exists TARGET.project_oids as select * from SOURCE.project_oids;
create view if not exists TARGET.project_organizations as select * from SOURCE.project_organizations;
create view if not exists TARGET.project_resultcount as select * from SOURCE.project_resultcount;
+create view if not exists TARGET.project_classification as select * from SOURCE.project_classification;

create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects;
compute stats TARGET.project_results;
@@ -115,8 +121,8 @@ create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.i
compute stats TARGET.indi_pub_gold_oa;
create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_pub_has_abstract;
-create table TARGET.indi_result_has_cc_licence_f stored as parquet as select * from SOURCE.indi_result_has_cc_licence_f orig where exists (select 1 from TARGET.result r where r.id=orig.id);
-compute stats TARGET.indi_result_has_cc_licence_f;
+create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+compute stats TARGET.indi_result_has_cc_licence;

create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_result_has_cc_licence_url;
@@ -138,6 +144,9 @@ compute stats TARGET.indi_pub_closed_other_open;
create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_result_no_of_copies;
+--- Usage statistics
+create table TARGET.usage_stats stored as parquet as select * from SOURCE.usage_stats orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
+
--denorm
alter table TARGET.result rename to TARGET.res_tmp;

From f2fde5566bc9875731911a3717210c691fbbc8e1 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Thu, 20 Jan 2022 09:19:07 +0100
Subject: [PATCH 43/51] using helper method from ModelSupport to find the inverse relation descriptor

---
 .../dhp/blacklist/ReadBlacklistFromDB.java | 2 +-
 .../dhp/blacklist/BlacklistRelationTest.java | 38 +++++++++++++++++++
 2 files changed, 39 insertions(+), 1 deletion(-)
 create mode 100644 dhp-workflows/dhp-blacklist/src/test/java/eu/dnetlib/dhp/blacklist/BlacklistRelationTest.java

diff --git a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/ReadBlacklistFromDB.java b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/ReadBlacklistFromDB.java
index 7d0d6b0b8..380991526 100644
--- a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/ReadBlacklistFromDB.java
+++ b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/ReadBlacklistFromDB.java
@@ -90,7 +90,7 @@ public class ReadBlacklistFromDB implements Closeable {
 inverse.setSource(target_direct);
 String encoding = rs.getString("relationship");
- RelationInverse ri = ModelSupport.relationInverseMap.get(encoding);
+ RelationInverse ri = ModelSupport.findInverse(encoding);
 direct.setRelClass(ri.getRelClass());
 inverse.setRelClass(ri.getInverseRelClass());
 direct.setRelType(ri.getRelType());

diff --git a/dhp-workflows/dhp-blacklist/src/test/java/eu/dnetlib/dhp/blacklist/BlacklistRelationTest.java b/dhp-workflows/dhp-blacklist/src/test/java/eu/dnetlib/dhp/blacklist/BlacklistRelationTest.java
new file mode 100644
index 000000000..160658e5b
--- /dev/null
+++ b/dhp-workflows/dhp-blacklist/src/test/java/eu/dnetlib/dhp/blacklist/BlacklistRelationTest.java
@@ -0,0 +1,38 @@
+
+package eu.dnetlib.dhp.blacklist;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.common.RelationInverse;
+
+public class BlacklistRelationTest {
+
+	@Test
+	public void testRelationInverseLookup() {
+
+		final List<String> rels = Arrays
+			.asList(
+				"resultResult_relationship_IsRelatedTo",
+				"resultOrganization_affiliation_isAuthorInstitutionOf",
+				"resultOrganization_affiliation_hasAuthorInstitution",
+				"datasourceOrganization_provision_isProvidedBy",
+				"projectOrganization_participation_hasParticipant",
+				"resultProject_outcome_produces",
+				"resultProject_outcome_isProducedBy");
+
+		rels.forEach(r -> {
+			RelationInverse inverse = ModelSupport.relationInverseMap.get(r);
+			Assertions.assertNotNull(inverse);
+			Assertions.assertNotNull(inverse.getRelType());
+			Assertions.assertNotNull(inverse.getSubReltype());
+			Assertions.assertNotNull(inverse.getRelClass());
+		});
+
+	}
+
+}

From 8de97883083bf2608a084840ab1f3e03f4244e05 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Mon, 24 Jan 2022 11:29:22 +0100
Subject: [PATCH 44/51] applied fix for avoiding ruling out the invisible (APC) records during the graph cleaning

---
 .../oaf/utils/GraphCleaningFunctions.java | 16 +
 .../clean/GraphCleaningFunctionsTest.java | 47 +
 .../dhp/oa/graph/clean/result_invisible.json | 958 ++++++++++++++++++
 .../graph/clean/result_missing_invisible.json | 922 +++++++++++++++++
 4 files changed, 1943 insertions(+)
 create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result_invisible.json
 create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result_missing_invisible.json

diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java
index f49a47c0a..48e35a342 100644
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java
@@ -88,6 +88,22 @@ public class GraphCleaningFunctions extends CleaningFunctions {
 }
 
 public static boolean filter(T value) {
+	if (Boolean.TRUE
+		.equals(
+			Optional
+				.ofNullable(value)
+				.map(
+					o -> Optional
+						.ofNullable(o.getDataInfo())
+						.map(
+							d -> Optional
+								.ofNullable(d.getInvisible())
+								.orElse(true))
+						.orElse(true))
+				.orElse(true))) {
+		return true;
+	}
+
 if (value instanceof Datasource) {
 // nothing to evaluate here
 } else if
(value instanceof Project) { diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java index 0e9efed21..ebd8cb55c 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java @@ -71,6 +71,53 @@ public class GraphCleaningFunctionsTest { } } + @Test + void testFilter_invisible_true() throws Exception { + + assertNotNull(vocabularies); + assertNotNull(mapping); + + String json = IOUtils + .toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result_invisible.json")); + Publication p_in = MAPPER.readValue(json, Publication.class); + + assertTrue(p_in instanceof Result); + assertTrue(p_in instanceof Publication); + + assertEquals(true, GraphCleaningFunctions.filter(p_in)); + } + + @Test + void testFilter_true_nothing_to_filter() throws Exception { + + assertNotNull(vocabularies); + assertNotNull(mapping); + + String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result.json")); + Publication p_in = MAPPER.readValue(json, Publication.class); + + assertTrue(p_in instanceof Result); + assertTrue(p_in instanceof Publication); + + assertEquals(true, GraphCleaningFunctions.filter(p_in)); + } + + @Test + void testFilter_missing_invisible() throws Exception { + + assertNotNull(vocabularies); + assertNotNull(mapping); + + String json = IOUtils + .toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result_missing_invisible.json")); + Publication p_in = MAPPER.readValue(json, Publication.class); + + assertTrue(p_in instanceof Result); + assertTrue(p_in instanceof Publication); + + assertEquals(true, GraphCleaningFunctions.filter(p_in)); + } + @Test void testCleaning() throws Exception { diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result_invisible.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result_invisible.json new file mode 100644 index 000000000..ffcb187c1 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result_invisible.json @@ -0,0 +1,958 @@ +{ + "author": [ + { + "affiliation": [ + ], + "fullname": "Brien, Tom", + "name": "Tom", + "pid": [ + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "ORCID12", + "classname": "ORCID12", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "0000-0001-9613-6639" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "ORCID12", + "classname": "ORCID12", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + 
}, + "value": "https://orcid.org/0000-0001-9613-6639" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:entityregistry", + "classname": "sysimport:crosswalk:entityregistry", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "orcid", + "classname": "ORCID12", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "0000-0001-9613-6639" + } + ], + "rank": 1, + "surname": "Brien" + }, + { + "affiliation": [ + ], + "fullname": "Ade, Peter", + "name": "Peter", + "pid": [ + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "xyz", + "classname": "XYZ", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "qwerty" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "ORCID", + "classname": "ORCID", + "schemeid": "", + "schemename": "" + }, + "value": "asdasd" + } + ], + "rank": 2, + "surname": "Ade" + }, + { + "affiliation": [ + ], + "fullname": "Barry, Peter S.", + "name": "Peter S.", + "pid": null, + "rank": 3, + "surname": "Barry" + }, + { + "affiliation": [ + ], + "fullname": "Dunscombe, Chris J.", + "name": "Chris J.", + "pid": [ + ], + "rank": 4, + "surname": "Dunscombe" + }, + { + "affiliation": [ + ], + "fullname": "Leadley, David R.", + "name": "David R.", + "pid": [ + ], + "rank": 5, + "surname": "Leadley" + }, + { + "affiliation": [ + ], + "fullname": "Morozov, Dmitry V.", + "name": "Dmitry V.", + "pid": [ + ], + "rank": 6, + "surname": "Morozov" + }, + { + "affiliation": [ + ], + "fullname": "Myronov, Maksym", + "name": "Maksym", + "pid": [ + ], + "rank": 7, + "surname": "Myronov" + }, + { + "affiliation": [ + ], + "fullname": "Parker, Evan", + "name": "Evan", + "pid": [ + ], + "rank": 8, + "surname": "Parker" + }, + { + "affiliation": [ + ], + "fullname": "Prest, Martin J.", + "name": "Martin J.", + "pid": [ + ], + "rank": 9, + "surname": "Prest" + }, + { + "affiliation": [ + ], + "fullname": "Prunnila, Mika", + "name": "Mika", + "pid": [ + ], + "rank": 10, + "surname": "Prunnila" + }, + { + "affiliation": [ + ], + "fullname": "Sudiwala, Rashmi V.", + "name": "Rashmi V.", + "pid": [ + ], + "rank": 11, + "surname": "Sudiwala" + }, + { + "affiliation": [ + ], + "fullname": "Whall, Terry E.", + "name": "Terry E.", + "pid": [ + ], + "rank": 12, + "surname": "Whall" + }, + { + "affiliation": [ + ], + "fullname": "Mauskopf", + "name": "", + "pid": [ + ], + "rank": 13, + "surname": "" + }, + { + "affiliation": [ + ], + "fullname": " P. D. 
", + "name": "", + "pid": [ + ], + "rank": 14, + "surname": "" + } + ], + "bestaccessright": null, + "publisher": { + "value": null + }, + "collectedfrom": [ + { + "key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747", + "value": "VIRTA" + } + ], + "context": [ + ], + "contributor": [ + ], + "country": [ + { + "classid": "DE", + "classname": "DE", + "schemeid": "dnet:countries", + "schemename": "dnet:countries" + } + ], + "coverage": [ + ], + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": true, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "dateofacceptance": { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "value": "7 oct 1970" + }, + "dateofcollection": "", + "dateoftransformation": "2020-04-22T12:34:08.009Z", + "description": [ + ], + "externalReference": [ + ], + "extraInfo": [ + ], + "format": [ + ], + "fulltext": [ + ], + "id": "50|CSC_________::2250a70c903c6ac6e4c01438259e9375", + "instance": [ + { + "pid": [ + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1007/s109090161569x" + }, + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1008/abcd" + } + ], + "alternateIdentifier": [ + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1007/s109090161569x" + }, + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1009/qwerty" + } + ], + "accessright": { + "classid": "CLOSED", + "classname": "CLOSED", + "schemeid": "dnet:access_modes", + "schemename": "dnet:access_modes" + }, + "collectedfrom": { + "key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747", + "value": "VIRTA" + }, + "dateofacceptance": { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "value": "2016-01-01" + }, + "distributionlocation": "", + "hostedby": { + "key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747", + "value": "VIRTA" + }, + "instancetype": { + "classid": "Comment/debate", + "classname": "Comment/debate", + "schemeid": "dnet:publication_resource", + "schemename": "dnet:publication_resource" + }, + "url": [ + "http://juuli.fi/Record/0275158616", + "http://dx.doi.org/10.1007/s109090161569x" + ] + } + ], + "journal": { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": 
"sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "edition": "", + "ep": " 7", + "iss": "9 March", + "issnLinking": "", + "issnOnline": "", + "issnPrinted": "0022-2291", + "name": "Journal of Low Temperature Physics - Early Acces", + "sp": "1 ", + "vol": "" + }, + "language": { + "classid": "UNKNOWN", + "classname": "UNKNOWN", + "schemeid": "dnet:languages", + "schemename": "dnet:languages" + }, + "lastupdatetimestamp": 1591283286319, + "oaiprovenance": { + "originDescription": { + "altered": true, + "baseURL": "https%3A%2F%2Fvirta-jtp.csc.fi%2Fapi%2Fcerif", + "datestamp": "2019-07-30", + "harvestDate": "2020-04-22T11:04:38.685Z", + "identifier": "oai:virta-jtp.csc.fi:Publications/0275158616", + "metadataNamespace": "" + } + }, + "originalId": [ + "CSC_________::2250a70c903c6ac6e4c01438259e9375" + ], + "pid": [ + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1007/s109090161569x" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1007/s109090161569x" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "" + } + ], + "relevantdate": [ + ], + "resourcetype": { + "classid": "0001", + "classname": "0001", + "schemeid": "dnet:dataCite_resource", + "schemename": "dnet:dataCite_resource" + }, + "resulttype": { + "classid": "publication", + "classname": "publication", + "schemeid": "dnet:result_typologies", + "schemename": "dnet:result_typologies" + }, + "source": [ + ], + "subject": [ + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "value": "ta213" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": 
"sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "value": "infrared detectors" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "value": "lens antennas" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "value": "silicon" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "value": "slot antennas" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "value": "strained silicon" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "value": "cold electron bolometers" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "value": "doped silicon" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + 
"schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "value": "measure noise" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "value": "noise equivalent power" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "value": "optical characterisation" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "value": "optical response" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "value": "photon noise" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "value": "silicon absorbers" + } + ], + "title": [ + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "main title", + "classname": "main title", + "schemeid": "dnet:dataCite_title", + "schemename": "dnet:dataCite_title" + }, + "value": "Optical response of strained- and unstrained-silicon cold-electron bolometers test" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": 
"dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "main title", + "classname": "main title", + "schemeid": "dnet:dataCite_title", + "schemename": "dnet:dataCite_title" + }, + "value": "test test 123 test" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "main title", + "classname": "main title", + "schemeid": "dnet:dataCite_title", + "schemename": "dnet:dataCite_title" + }, + "value": "omic" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "main title", + "classname": "main title", + "schemeid": "dnet:dataCite_title", + "schemename": "dnet:dataCite_title" + }, + "value": "「マキャベリ的知性と心の理論の進化論」 リチャード・バーン, アンドリュー・ホワイトゥン 編/藤田和生, 山下博志, 友永雅巳 監訳" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "main title", + "classname": "main title", + "schemeid": "dnet:dataCite_title", + "schemename": "dnet:dataCite_title" + }, + "value": "-" + } + ] +} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result_missing_invisible.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result_missing_invisible.json new file mode 100644 index 000000000..c0b5a959a --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result_missing_invisible.json @@ -0,0 +1,922 @@ +{ + "author": [ + { + "affiliation": [], + "fullname": "Brien, Tom", + "name": "Tom", + "pid": [ + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "ORCID12", + "classname": "ORCID12", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "0000-0001-9613-6639" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "ORCID12", + "classname": "ORCID12", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": 
"https://orcid.org/0000-0001-9613-6639" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:entityregistry", + "classname": "sysimport:crosswalk:entityregistry", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "orcid", + "classname": "ORCID12", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "0000-0001-9613-6639" + } + ], + "rank": 1, + "surname": "Brien" + }, + { + "affiliation": [], + "fullname": "Ade, Peter", + "name": "Peter", + "pid": [ + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "xyz", + "classname": "XYZ", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "qwerty" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "ORCID", + "classname": "ORCID", + "schemeid": "", + "schemename": "" + }, + "value": "asdasd" + } + ], + "rank": 2, + "surname": "Ade" + }, + { + "affiliation": [], + "fullname": "Barry, Peter S.", + "name": "Peter S.", + "pid": null, + "rank": 3, + "surname": "Barry" + }, + { + "affiliation": [], + "fullname": "Dunscombe, Chris J.", + "name": "Chris J.", + "pid": [], + "rank": 4, + "surname": "Dunscombe" + }, + { + "affiliation": [], + "fullname": "Leadley, David R.", + "name": "David R.", + "pid": [], + "rank": 5, + "surname": "Leadley" + }, + { + "affiliation": [], + "fullname": "Morozov, Dmitry V.", + "name": "Dmitry V.", + "pid": [], + "rank": 6, + "surname": "Morozov" + }, + { + "affiliation": [], + "fullname": "Myronov, Maksym", + "name": "Maksym", + "pid": [], + "rank": 7, + "surname": "Myronov" + }, + { + "affiliation": [], + "fullname": "Parker, Evan", + "name": "Evan", + "pid": [], + "rank": 8, + "surname": "Parker" + }, + { + "affiliation": [], + "fullname": "Prest, Martin J.", + "name": "Martin J.", + "pid": [], + "rank": 9, + "surname": "Prest" + }, + { + "affiliation": [], + "fullname": "Prunnila, Mika", + "name": "Mika", + "pid": [], + "rank": 10, + "surname": "Prunnila" + }, + { + "affiliation": [], + "fullname": "Sudiwala, Rashmi V.", + "name": "Rashmi V.", + "pid": [], + "rank": 11, + "surname": "Sudiwala" + }, + { + "affiliation": [], + "fullname": "Whall, Terry E.", + "name": "Terry E.", + "pid": [], + "rank": 12, + "surname": "Whall" + }, + { + "affiliation": [], + "fullname": "Mauskopf", + "name": "", + "pid": [], + "rank": 13, + "surname": "" + }, + { + "affiliation": [], + "fullname": " P. D. 
", + "name": "", + "pid": [], + "rank": 14, + "surname": "" + } + ], + "bestaccessright": null, + "publisher": { + "value": null + }, + "collectedfrom": [ + { + "key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747", + "value": "VIRTA" + } + ], + "context": [], + "contributor": [], + "country": [ + { + "classid": "DE", + "classname": "DE", + "schemeid": "dnet:countries", + "schemename": "dnet:countries" + } + ], + "coverage": [], + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "dateofacceptance": { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "value": "7 oct 1970" + }, + "dateofcollection": "", + "dateoftransformation": "2020-04-22T12:34:08.009Z", + "description": [], + "externalReference": [], + "extraInfo": [], + "format": [], + "fulltext": [], + "id": "50|CSC_________::2250a70c903c6ac6e4c01438259e9375", + "instance": [ + { + "pid": [ + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1007/s109090161569x" + }, + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1008/abcd" + } + ], + "alternateIdentifier": [ + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1007/s109090161569x" + }, + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1009/qwerty" + } + ], + "accessright": { + "classid": "CLOSED", + "classname": "CLOSED", + "schemeid": "dnet:access_modes", + "schemename": "dnet:access_modes" + }, + "collectedfrom": { + "key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747", + "value": "VIRTA" + }, + "dateofacceptance": { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "value": "2016-01-01" + }, + "distributionlocation": "", + "hostedby": { + "key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747", + "value": "VIRTA" + }, + "instancetype": { + "classid": "Comment/debate", + "classname": "Comment/debate", + "schemeid": "dnet:publication_resource", + "schemename": "dnet:publication_resource" + }, + "url": [ + "http://juuli.fi/Record/0275158616", + "http://dx.doi.org/10.1007/s109090161569x" + ] + } + ], + "journal": { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": 
"sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "edition": "", + "ep": " 7", + "iss": "9 March", + "issnLinking": "", + "issnOnline": "", + "issnPrinted": "0022-2291", + "name": "Journal of Low Temperature Physics - Early Acces", + "sp": "1 ", + "vol": "" + }, + "language": { + "classid": "UNKNOWN", + "classname": "UNKNOWN", + "schemeid": "dnet:languages", + "schemename": "dnet:languages" + }, + "lastupdatetimestamp": 1591283286319, + "oaiprovenance": { + "originDescription": { + "altered": true, + "baseURL": "https%3A%2F%2Fvirta-jtp.csc.fi%2Fapi%2Fcerif", + "datestamp": "2019-07-30", + "harvestDate": "2020-04-22T11:04:38.685Z", + "identifier": "oai:virta-jtp.csc.fi:Publications/0275158616", + "metadataNamespace": "" + } + }, + "originalId": [ + "CSC_________::2250a70c903c6ac6e4c01438259e9375" + ], + "pid": [ + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1007/s109090161569x" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1007/s109090161569x" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "" + } + ], + "relevantdate": [], + "resourcetype": { + "classid": "0001", + "classname": "0001", + "schemeid": "dnet:dataCite_resource", + "schemename": "dnet:dataCite_resource" + }, + "resulttype": { + "classid": "publication", + "classname": "publication", + "schemeid": "dnet:result_typologies", + "schemename": "dnet:result_typologies" + }, + "source": [], + "subject": [ + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "value": "ta213" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": 
"sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "value": "infrared detectors" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "value": "lens antennas" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "value": "silicon" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "value": "slot antennas" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "value": "strained silicon" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "value": "cold electron bolometers" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "value": "doped silicon" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": 
"0.9" + }, + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "value": "measure noise" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "value": "noise equivalent power" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "value": "optical characterisation" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "value": "optical response" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "value": "photon noise" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "value": "silicon absorbers" + } + ], + "title": [ + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "main title", + "classname": "main title", + "schemeid": "dnet:dataCite_title", + "schemename": "dnet:dataCite_title" + }, + "value": "Optical response of strained- and unstrained-silicon cold-electron bolometers test" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": 
"dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "main title", + "classname": "main title", + "schemeid": "dnet:dataCite_title", + "schemename": "dnet:dataCite_title" + }, + "value": "test test 123 test" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "main title", + "classname": "main title", + "schemeid": "dnet:dataCite_title", + "schemename": "dnet:dataCite_title" + }, + "value": "omic" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "main title", + "classname": "main title", + "schemeid": "dnet:dataCite_title", + "schemename": "dnet:dataCite_title" + }, + "value": "「マキャベリ的知性と心の理論の進化論」 リチャード・バーン, アンドリュー・ホワイトゥン 編/藤田和生, 山下博志, 友永雅巳 監訳" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "main title", + "classname": "main title", + "schemeid": "dnet:dataCite_title", + "schemename": "dnet:dataCite_title" + }, + "value": "-" + } + ] +} \ No newline at end of file From 59a250337c473f6153e3e3a1771f47dd6830690f Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 24 Jan 2022 18:02:07 +0100 Subject: [PATCH 45/51] [graph resolution] drop output path at the beginning --- .../dhp/oa/graph/resolution/oozie_app/workflow.xml | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/oozie_app/workflow.xml index 3cd08bc9b..74e792f07 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/oozie_app/workflow.xml @@ -19,12 +19,21 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + yarn From 3c60e53a96b68669e64d09aadc05031408d05cb9 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 3 Feb 2022 14:47:08 +0100 Subject: [PATCH 46/51] [stats-wf]fixed the result_result creation for monitor PR#190 on beta --- .../graph/stats/oozie_app/scripts/step13.sql | 14 ++++++------- .../graph/stats/oozie_app/scripts/step14.sql | 10 +++++----- .../graph/stats/oozie_app/scripts/step15.sql | 8 ++++---- .../graph/stats/oozie_app/scripts/step2.sql | 20 +++++++++---------- .../scripts/step20-createMonitorDB.sql | 2 +- .../graph/stats/oozie_app/scripts/step3.sql | 20 +++++++++---------- .../graph/stats/oozie_app/scripts/step4.sql | 20 +++++++++---------- 
From 3c60e53a96b68669e64d09aadc05031408d05cb9 Mon Sep 17 00:00:00 2001
From: "miriam.baglioni"
Date: Thu, 3 Feb 2022 14:47:08 +0100
Subject: [PATCH 46/51] [stats-wf] fixed the result_result creation for
 monitor PR#190 on beta

---
 .../graph/stats/oozie_app/scripts/step13.sql  | 14 ++++++-------
 .../graph/stats/oozie_app/scripts/step14.sql  | 10 +++++-----
 .../graph/stats/oozie_app/scripts/step15.sql  |  8 ++++----
 .../graph/stats/oozie_app/scripts/step2.sql   | 20 +++++++++----------
 .../scripts/step20-createMonitorDB.sql        |  2 +-
 .../graph/stats/oozie_app/scripts/step3.sql   | 20 +++++++++----------
 .../graph/stats/oozie_app/scripts/step4.sql   | 20 +++++++++----------
 .../graph/stats/oozie_app/scripts/step5.sql   | 20 +++++++++----------
 .../graph/stats/oozie_app/scripts/step6.sql   | 12 ++++++-----
 .../graph/stats/oozie_app/scripts/step7.sql   |  2 +-
 .../graph/stats/oozie_app/scripts/step8.sql   |  2 +-
 .../graph/stats/oozie_app/scripts/step9.sql   |  2 +-
 12 files changed, 67 insertions(+), 65 deletions(-)

diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql
index 315d296fc..947c91072 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql
@@ -14,7 +14,7 @@ LEFT OUTER JOIN
 (
     SELECT substr(d.id, 4) id
     from ${openaire_db_name}.datasource d
-    WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;
+    WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;
 
 CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_sources as
 SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
@@ -25,7 +25,7 @@ LEFT OUTER JOIN
 (
     SELECT substr(d.id, 4) id
     from ${openaire_db_name}.datasource d
-    WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;
+    WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;
 
 CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_sources as
 SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
@@ -36,7 +36,7 @@ LEFT OUTER JOIN
 (
     SELECT substr(d.id, 4) id
     from ${openaire_db_name}.datasource d
-    WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;
+    WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;
 
 CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_sources as
 SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
@@ -47,7 +47,7 @@ LEFT OUTER JOIN
 (
     SELECT substr(d.id, 4) id
     from ${openaire_db_name}.datasource d
-    WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;
+    WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;
 
 CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_sources AS
 SELECT * FROM ${stats_db_name}.publication_sources
@@ -76,8 +76,8 @@ join ${openaire_db_name}.result r1 on rel.source=r1.id
 join ${openaire_db_name}.result r2 on r2.id=rel.target
 where reltype='resultResult'
   and r1.resulttype.classname!=r2.resulttype.classname
-  and r1.datainfo.deletedbyinference=false
-  and r2.datainfo.deletedbyinference=false
+  and r1.datainfo.deletedbyinference=false and r1.datainfo.invisible = FALSE
+  and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE
   and r1.resulttype.classname != 'other'
   and r2.resulttype.classname != 'other'
-  and rel.datainfo.deletedbyinference=false;
\ No newline at end of file
+  and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE;
\ No newline at end of file
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql
index 00a6913bc..e3a33a893 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql
@@ -8,22 +8,22 @@ CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_licenses AS
 SELECT substr(p.id, 4) as id, licenses.value as type
 from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.license) instances as licenses
-where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false;
+where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
 
 CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_licenses AS
 SELECT substr(p.id, 4) as id, licenses.value as type
 from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.license) instances as licenses
-where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false;
+where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
 
 CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_licenses AS
 SELECT substr(p.id, 4) as id, licenses.value as type
 from ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.license) instances as licenses
-where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false;
+where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
 
 CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_licenses AS
 SELECT substr(p.id, 4) as id, licenses.value as type
 from ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.license) instances as licenses
-where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false;
+where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
 
 CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_licenses AS
 SELECT * FROM ${stats_db_name}.publication_licenses
@@ -46,7 +46,7 @@ FROM (
 LEFT OUTER JOIN (
     SELECT substr(d.id, 4) id
     from ${openaire_db_name}.datasource d
-    WHERE d.datainfo.deletedbyinference=false) d on o.datasource = d.id;
+    WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id;
 
 -- ANALYZE TABLE ${stats_db_name}.publication_licenses COMPUTE STATISTICS;
 -- ANALYZE TABLE ${stats_db_name}.publication_licenses COMPUTE STATISTICS FOR COLUMNS;
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql
index 8e66e05c0..f46b65171 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql
@@ -9,22 +9,22 @@ CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_refereed as
 select substr(r.id, 4) as id, inst.refereed.classname as refereed
 from ${openaire_db_name}.publication r lateral view explode(r.instance) instances as inst
-where r.datainfo.deletedbyinference=false;
+where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE;
 
 CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_refereed as
 select substr(r.id, 4) as id, inst.refereed.classname as refereed
 from ${openaire_db_name}.dataset r lateral view explode(r.instance) instances as inst
-where r.datainfo.deletedbyinference=false;
+where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE;
 
 CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_refereed as
 select substr(r.id, 4) as id, inst.refereed.classname as refereed
 from ${openaire_db_name}.software r lateral view explode(r.instance) instances as inst
-where r.datainfo.deletedbyinference=false;
+where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE;
 
 CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_refereed as
 select substr(r.id, 4) as id, inst.refereed.classname as refereed
 from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst
-where r.datainfo.deletedbyinference=false;
+where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE;
 
 CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed as
 select * from ${stats_db_name}.publication_refereed
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql
index bb0d0ac6c..468a42045 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql
@@ -38,13 +38,13 @@ SELECT substr(p.id, 4) as id,
        case when size(p.description) > 0 then true else false end as abstract,
        'publication' as type
 from ${openaire_db_name}.publication p
-where p.datainfo.deletedbyinference = false;
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
 
 CREATE TABLE ${stats_db_name}.publication_classifications AS
 SELECT substr(p.id, 4) as id, instancetype.classname as type
 from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.instancetype) instances as instancetype
-where p.datainfo.deletedbyinference = false;
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
 
 CREATE TABLE ${stats_db_name}.publication_concepts AS
 SELECT substr(p.id, 4) as id, case
@@ -53,45 +53,45 @@ SELECT substr(p.id, 4) as id, case
             when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
 from ${openaire_db_name}.publication p LATERAL VIEW explode(p.context) contexts as context
-where p.datainfo.deletedbyinference = false;
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
 
 CREATE TABLE ${stats_db_name}.publication_datasources as
 SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
 FROM (
     SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource
     from ${openaire_db_name}.publication p lateral view explode(p.instance) instances as instance
-    where p.datainfo.deletedbyinference = false) p
+    where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false) p
 LEFT OUTER JOIN (
     SELECT substr(d.id, 4) id
     from ${openaire_db_name}.datasource d
-    WHERE d.datainfo.deletedbyinference = false) d on p.datasource = d.id;
+    WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id;
 
 CREATE TABLE ${stats_db_name}.publication_languages AS
 select substr(p.id, 4) as id, p.language.classname as language
 FROM ${openaire_db_name}.publication p
-where p.datainfo.deletedbyinference = false;
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
 
 CREATE TABLE ${stats_db_name}.publication_oids AS
 SELECT substr(p.id, 4) AS id, oids.ids AS oid
 FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.originalid) oids AS ids
-where p.datainfo.deletedbyinference = false;
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
 
 CREATE TABLE ${stats_db_name}.publication_pids AS
 SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid
 FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.pid) pids AS ppid
-where p.datainfo.deletedbyinference = false;
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
 
 CREATE TABLE ${stats_db_name}.publication_topics as
 select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic
 FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.subject) subjects AS subject
-where p.datainfo.deletedbyinference = false;
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
 
 CREATE TABLE ${stats_db_name}.publication_citations AS
 SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
 FROM ${openaire_db_name}.publication p lateral view explode(p.extrainfo) citations AS citation
 WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
-  and p.datainfo.deletedbyinference = false;
\ No newline at end of file
+  and p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
\ No newline at end of file
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql
index e72378f56..2528e904d 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql
@@ -81,7 +81,7 @@ compute stats TARGET.result_sources;
 create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id);
 compute stats TARGET.result_topics;
 
-create table TARGET.result_result stored as parquet as select * from SOURCE.result_result orig where exists (select 1 from TARGET.result r where r.id=orig.source or r.id=orig.target);
+create table TARGET.result_result stored as parquet as select * from SOURCE.result_result orig where exists (select 1 from TARGET.result r where r.id=orig.source) or exists (select 1 from TARGET.result r where r.id=orig.target);
 compute stats TARGET.result_result;
 
 -- datasources
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql
index 953eaad6a..76a5e5a48 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql
@@ -38,20 +38,20 @@ SELECT substr(d.id, 4) AS id,
        CASE WHEN SIZE(d.description) > 0 THEN TRUE ELSE FALSE end AS abstract,
        'dataset' AS type
 FROM ${openaire_db_name}.dataset d
-WHERE d.datainfo.deletedbyinference = FALSE;
+WHERE d.datainfo.deletedbyinference = FALSE and d.datainfo.invisible=false;
 
 CREATE TABLE ${stats_db_name}.dataset_citations AS
 SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
 FROM ${openaire_db_name}.dataset d LATERAL VIEW explode(d.extrainfo) citations AS citation
 WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
-  and d.datainfo.deletedbyinference = false;
+  and d.datainfo.deletedbyinference = false and d.datainfo.invisible=false;
 
 CREATE TABLE ${stats_db_name}.dataset_classifications AS
 SELECT substr(p.id, 4) AS id, instancetype.classname AS type
 FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype
-where p.datainfo.deletedbyinference = false;
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
 
 CREATE TABLE ${stats_db_name}.dataset_concepts AS
 SELECT substr(p.id, 4) as id, case
@@ -60,7 +60,7 @@ SELECT substr(p.id, 4) as id, case
             when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
 from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.context) contexts as context
-where p.datainfo.deletedbyinference = false;
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
 
 CREATE TABLE ${stats_db_name}.dataset_datasources AS
 SELECT p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource
@@ -68,31 +68,31 @@ FROM (
     SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) AS datasource
     FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance) instances AS instance
-    where p.datainfo.deletedbyinference = false) p
+    where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false) p
 LEFT OUTER JOIN (
     SELECT substr(d.id, 4) id
     FROM ${openaire_db_name}.datasource d
-    WHERE d.datainfo.deletedbyinference = false) d ON p.datasource = d.id;
+    WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id;
 
 CREATE TABLE ${stats_db_name}.dataset_languages AS
 SELECT substr(p.id, 4) AS id, p.language.classname AS language
 FROM ${openaire_db_name}.dataset p
-where p.datainfo.deletedbyinference = false;
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
 
 CREATE TABLE ${stats_db_name}.dataset_oids AS
 SELECT substr(p.id, 4) AS id, oids.ids AS oid
 FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.originalid) oids AS ids
-where p.datainfo.deletedbyinference = false;
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
 
 CREATE TABLE ${stats_db_name}.dataset_pids AS
 SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
 FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.pid) pids AS ppid
-where p.datainfo.deletedbyinference = false;
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
 
 CREATE TABLE ${stats_db_name}.dataset_topics AS
 SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
 FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.subject) subjects AS subject
-where p.datainfo.deletedbyinference = false;
\ No newline at end of file
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
\ No newline at end of file
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql
index 0210dc8cb..dc71f41f1 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql
@@ -38,20 +38,20 @@ SELECT substr(s.id, 4) as id,
        CASE WHEN SIZE(s.description) > 0 THEN TRUE ELSE FALSE END AS abstract,
        'software' as type
 from ${openaire_db_name}.software s
-where s.datainfo.deletedbyinference = false;
+where s.datainfo.deletedbyinference = false and s.datainfo.invisible=false;
 
 CREATE TABLE ${stats_db_name}.software_citations AS
 SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
 FROM ${openaire_db_name}.software s LATERAL VIEW explode(s.extrainfo) citations as citation
 where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
-  and s.datainfo.deletedbyinference = false;
+  and s.datainfo.deletedbyinference = false and s.datainfo.invisible=false;
 
 CREATE TABLE ${stats_db_name}.software_classifications AS
 SELECT substr(p.id, 4) AS id, instancetype.classname AS type
 FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype
-where p.datainfo.deletedbyinference = false;
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
 
 CREATE TABLE ${stats_db_name}.software_concepts AS
 SELECT substr(p.id, 4) as id, case
@@ -60,7 +60,7 @@ SELECT substr(p.id, 4) as id, case
             when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
 FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.context) contexts AS context
-where p.datainfo.deletedbyinference = false;
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
 
 CREATE TABLE ${stats_db_name}.software_datasources AS
 SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource
@@ -68,31 +68,31 @@ FROM (
     SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource
     FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.instance) instances AS instance
-    where p.datainfo.deletedbyinference = false) p
+    where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false) p
 LEFT OUTER JOIN (
     SELECT substr(d.id, 4) id
    FROM ${openaire_db_name}.datasource d
-    WHERE d.datainfo.deletedbyinference = false) d ON p.datasource = d.id;
+    WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id;
 
 CREATE TABLE ${stats_db_name}.software_languages AS
 select substr(p.id, 4) AS id, p.language.classname AS language
 FROM ${openaire_db_name}.software p
-where p.datainfo.deletedbyinference = false;
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
 
 CREATE TABLE ${stats_db_name}.software_oids AS
 SELECT substr(p.id, 4) AS id, oids.ids AS oid
 FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.originalid) oids AS ids
-where p.datainfo.deletedbyinference = false;
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
 
 CREATE TABLE ${stats_db_name}.software_pids AS
 SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
 FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.pid) pids AS ppid
-where p.datainfo.deletedbyinference = false;
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
 
 CREATE TABLE ${stats_db_name}.software_topics AS
 SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
 FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.subject) subjects AS subject
-where p.datainfo.deletedbyinference = false;
\ No newline at end of file
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
\ No newline at end of file
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql
index f7b302186..353aa98b7 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql
@@ -37,19 +37,19 @@ SELECT substr(o.id, 4) AS id,
        CASE WHEN SIZE(o.description) > 0 THEN TRUE ELSE FALSE END AS abstract,
        'other' AS type
 FROM ${openaire_db_name}.otherresearchproduct o
-WHERE o.datainfo.deletedbyinference = FALSE;
+WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible=false;
 
 -- Otherresearchproduct_citations
 CREATE TABLE ${stats_db_name}.otherresearchproduct_citations AS
 SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
 FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation
 WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
-  and o.datainfo.deletedbyinference = false;
+  and o.datainfo.deletedbyinference = false and o.datainfo.invisible=false;
 
 CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications AS
 SELECT substr(p.id, 4) AS id, instancetype.classname AS type
 FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype
-where p.datainfo.deletedbyinference = false;
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
 
 CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts AS
 SELECT substr(p.id, 4) as id, case
@@ -57,33 +57,33 @@ SELECT substr(p.id, 4) as id, case
             when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
             when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
 FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context
-where p.datainfo.deletedbyinference = false;
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
 
 CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources AS
 SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource
 FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource
       from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.instance) instances as instance
-      where p.datainfo.deletedbyinference = false) p
+      where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false) p
 LEFT OUTER JOIN(SELECT substr(d.id, 4) id
                 from ${openaire_db_name}.datasource d
-                WHERE d.datainfo.deletedbyinference = false) d on p.datasource = d.id;
+                WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id;
 
 CREATE TABLE ${stats_db_name}.otherresearchproduct_languages AS
 SELECT substr(p.id, 4) AS id, p.language.classname AS language
 FROM ${openaire_db_name}.otherresearchproduct p
-where p.datainfo.deletedbyinference = false;
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
 
 CREATE TABLE ${stats_db_name}.otherresearchproduct_oids AS
 SELECT substr(p.id, 4) AS id, oids.ids AS oid
 FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.originalid) oids AS ids
-where p.datainfo.deletedbyinference = false;
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
 
 CREATE TABLE ${stats_db_name}.otherresearchproduct_pids AS
 SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
 FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid
-where p.datainfo.deletedbyinference = false;
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
 
 CREATE TABLE ${stats_db_name}.otherresearchproduct_topics AS
 SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
 FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject
-where p.datainfo.deletedbyinference = false;
\ No newline at end of file
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
\ No newline at end of file
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql
index 378e0f17b..b5e2eb37b 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql
@@ -5,24 +5,26 @@
 ------------------------------------------------------
 CREATE TABLE ${stats_db_name}.project_oids AS
 SELECT substr(p.id, 4) AS id, oids.ids AS oid
-FROM ${openaire_db_name}.project p LATERAL VIEW explode(p.originalid) oids AS ids;
+FROM ${openaire_db_name}.project p LATERAL VIEW explode(p.originalid) oids AS ids
+where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false;
+
 CREATE TABLE ${stats_db_name}.project_organizations AS
 SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization
 from ${openaire_db_name}.relation r
 WHERE r.reltype = 'projectOrganization'
-  and r.datainfo.deletedbyinference = false;
+  and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;
 
 CREATE TABLE ${stats_db_name}.project_results AS
 SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance
 FROM ${openaire_db_name}.relation r
 WHERE r.reltype = 'resultProject'
-  and r.datainfo.deletedbyinference = false;
+  and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;
 
 create table ${stats_db_name}.project_classification as
 select substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, class.level3
 from ${openaire_db_name}.project p
   lateral view explode(p.h2020classification) classifs as class
-where p.datainfo.deletedbyinference=false and class.h2020programme is not null;
+where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false and class.h2020programme is not null;
 
 CREATE TABLE ${stats_db_name}.project_tmp
 (
@@ -72,7 +74,7 @@ SELECT substr(p.id, 4) AS id,
        p.code.value AS code,
        p.totalcost AS totalcost
 FROM ${openaire_db_name}.project p
-WHERE p.datainfo.deletedbyinference = false;
+WHERE p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
 
 create table ${stats_db_name}.funder as
 select distinct xpath_string(fund, '//funder/id') as id,
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql
index b3cbc9b41..a57966abf 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql
@@ -127,7 +127,7 @@ CREATE TABLE ${stats_db_name}.result_organization AS
 SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
 FROM ${openaire_db_name}.relation r
 WHERE r.reltype = 'resultOrganization'
-  and r.datainfo.deletedbyinference = false;
+  and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;
 
 CREATE TABLE ${stats_db_name}.result_projects AS
 select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql
index 76d31eb5e..4956daf7f 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql
@@ -44,7 +44,7 @@ FROM ${openaire_db_name}.datasource d1
           LATERAL VIEW EXPLODE(originalid) temp AS originalidd
      WHERE originalidd like "piwik:%") AS d2
                ON d1.id = d2.id
-WHERE d1.datainfo.deletedbyinference = FALSE;
+WHERE d1.datainfo.deletedbyinference = FALSE and d1.datainfo.invisible=false;
 
 -- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table.
 -- Creating a temporary dual table that will be removed after the following insert
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql
index a1cb46185..c73aa811c 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql
@@ -9,7 +9,7 @@ SELECT substr(o.id, 4) as id,
        o.legalshortname.value as legalshortname,
        o.country.classid as country
 FROM ${openaire_db_name}.organization o
-WHERE o.datainfo.deletedbyinference = FALSE;
+WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible = FALSE;
 
 CREATE OR REPLACE VIEW ${stats_db_name}.organization_datasources AS
 SELECT organization AS id, id AS datasource
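Taken together, patch 46 applies one recurring guard across the statistics scripts: a record read from the graph must be neither deleted by inference nor flagged invisible before it is counted. A condensed, hypothetical illustration of that pattern (the ${openaire_db_name}/${stats_db_name} variables and the datainfo struct are as used in the diffs above; the table name is invented for the sketch):

    -- Sketch of the guard added throughout this patch: only visible,
    -- non-deleted records survive into the stats table.
    CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_example AS
    SELECT substr(p.id, 4) AS id
    FROM ${openaire_db_name}.publication p
    WHERE p.datainfo.deletedbyinference = false
      AND p.datainfo.invisible = false;
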
From bcfdf9a0d70d3ebe3b42134c18c50b2f58ea4f9a Mon Sep 17 00:00:00 2001
From: Sandro La Bruzzo
Date: Thu, 3 Feb 2022 16:49:31 +0100
Subject: [PATCH 47/51] iis repository with https

---
 dhp-workflows/pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml
index 53d029467..143178560 100644
--- a/dhp-workflows/pom.xml
+++ b/dhp-workflows/pom.xml
@@ -44,7 +44,7 @@
             iis-releases
             iis releases plugin repository
-            http://maven.ceon.pl/artifactory/iis-releases
+            https://maven.ceon.pl/artifactory/iis-releases
             default

From 493caef358d37a4f4d32223b0e1923bc7209885a Mon Sep 17 00:00:00 2001
From: "miriam.baglioni"
Date: Fri, 4 Feb 2022 14:51:25 +0100
Subject: [PATCH 48/51] [stats-wf] fixed the result_result table related to
 PR#191

---
 .scalafmt.conf                                | 21 +++++++++++++++++++
 dhp-workflows/dhp-aggregation/.scalafmt.conf  | 21 +++++++++++++++++++
 dhp-workflows/dhp-graph-mapper/.scalafmt.conf | 21 +++++++++++++++++++
 .../scripts/step20-createMonitorDB.sql        |  6 +++++-
 4 files changed, 68 insertions(+), 1 deletion(-)
 create mode 100644 .scalafmt.conf
 create mode 100644 dhp-workflows/dhp-aggregation/.scalafmt.conf
 create mode 100644 dhp-workflows/dhp-graph-mapper/.scalafmt.conf

diff --git a/.scalafmt.conf b/.scalafmt.conf
new file mode 100644
index 000000000..0b5dbe0b4
--- /dev/null
+++ b/.scalafmt.conf
@@ -0,0 +1,21 @@
+style = defaultWithAlign
+
+align.openParenCallSite = false
+align.openParenDefnSite = false
+align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}]
+continuationIndent.callSite = 2
+continuationIndent.defnSite = 2
+danglingParentheses = true
+indentOperator = spray
+maxColumn = 120
+newlines.alwaysBeforeTopLevelStatements = true
+project.excludeFilters = [".*\\.sbt"]
+rewrite.rules = [AvoidInfix]
+rewrite.rules = [ExpandImportSelectors]
+rewrite.rules = [RedundantBraces]
+rewrite.rules = [RedundantParens]
+rewrite.rules = [SortImports]
+rewrite.rules = [SortModifiers]
+rewrite.rules = [PreferCurlyFors]
+spaces.inImportCurlyBraces = false
+unindentTopLevelOperators = true
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/.scalafmt.conf b/dhp-workflows/dhp-aggregation/.scalafmt.conf
new file mode 100644
index 000000000..0b5dbe0b4
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/.scalafmt.conf
@@ -0,0 +1,21 @@
+style = defaultWithAlign
+
+align.openParenCallSite = false
+align.openParenDefnSite = false
+align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}]
+continuationIndent.callSite = 2
+continuationIndent.defnSite = 2
+danglingParentheses = true
+indentOperator = spray
+maxColumn = 120
+newlines.alwaysBeforeTopLevelStatements = true
+project.excludeFilters = [".*\\.sbt"]
+rewrite.rules = [AvoidInfix]
+rewrite.rules = [ExpandImportSelectors]
+rewrite.rules = [RedundantBraces]
+rewrite.rules = [RedundantParens]
+rewrite.rules = [SortImports]
+rewrite.rules = [SortModifiers]
+rewrite.rules = [PreferCurlyFors]
+spaces.inImportCurlyBraces = false
+unindentTopLevelOperators = true
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-mapper/.scalafmt.conf b/dhp-workflows/dhp-graph-mapper/.scalafmt.conf
new file mode 100644
index 000000000..0b5dbe0b4
--- /dev/null
+++ b/dhp-workflows/dhp-graph-mapper/.scalafmt.conf
@@ -0,0 +1,21 @@
+style = defaultWithAlign
+
+align.openParenCallSite = false
+align.openParenDefnSite = false
+align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}]
+continuationIndent.callSite = 2
+continuationIndent.defnSite = 2
+danglingParentheses = true
+indentOperator = spray
+maxColumn = 120
+newlines.alwaysBeforeTopLevelStatements = true
+project.excludeFilters = [".*\\.sbt"]
+rewrite.rules = [AvoidInfix]
+rewrite.rules = [ExpandImportSelectors]
+rewrite.rules = [RedundantBraces]
+rewrite.rules = [RedundantParens]
+rewrite.rules = [SortImports]
+rewrite.rules = [SortModifiers]
+rewrite.rules = [PreferCurlyFors]
+spaces.inImportCurlyBraces = false
+unindentTopLevelOperators = true
\ No newline at end of file
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql
index 2528e904d..4e5f7a373 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql
@@ -81,7 +81,11 @@ compute stats TARGET.result_sources;
 create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id);
 compute stats TARGET.result_topics;
 
-create table TARGET.result_result stored as parquet as select * from SOURCE.result_result orig where exists (select 1 from TARGET.result r where r.id=orig.source) or exists (select 1 from TARGET.result r where r.id=orig.target);
+create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result);
+create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result);
+create table TARGET.result_result as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou;
+drop view TARGET.foo1;
+drop view TARGET.foo2;
 compute stats TARGET.result_result;
 
 -- datasources
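The result_result rewrite above trades a single disjunctive EXISTS for two single-condition views plus a distinct union: Hive/Impala can plan each IN/EXISTS with one correlation as a semi-join, while an `exists(...) or exists(...)` predicate cannot be decomposed that way. A hypothetical semi-join equivalent of the same selection (the `rr_by_*` view names are invented for the sketch):

    -- Each IN predicate maps to a LEFT SEMI JOIN; the plain UNION (no ALL)
    -- removes rows matched from both sides, like the SELECT DISTINCT above.
    CREATE VIEW TARGET.rr_by_source AS
    SELECT rr.* FROM SOURCE.result_result rr
    LEFT SEMI JOIN TARGET.result r ON rr.source = r.id;

    CREATE VIEW TARGET.rr_by_target AS
    SELECT rr.* FROM SOURCE.result_result rr
    LEFT SEMI JOIN TARGET.result r ON rr.target = r.id;

    CREATE TABLE TARGET.result_result_example AS
    SELECT * FROM TARGET.rr_by_source
    UNION
    SELECT * FROM TARGET.rr_by_target;
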
From 78be2975f0d324bb4e8947f93f2609d77eb84ec2 Mon Sep 17 00:00:00 2001
From: "miriam.baglioni"
Date: Mon, 7 Feb 2022 11:22:08 +0100
Subject: [PATCH 49/51] [stats-wf] fixed another typo related to PR#193

---
 .../oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql
index 4e5f7a373..9e4edb44a 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql
@@ -130,7 +130,7 @@ compute stats TARGET.indi_result_has_cc_licence;
 create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id);
 compute stats TARGET.indi_result_has_cc_licence_url;
 
-create view TARGET.indi_funder_country_collab stored as select * from SOURCE.indi_funder_country_collab;
+create view TARGET.indi_funder_country_collab stored as parquet as select * from SOURCE.indi_funder_country_collab;
 
 create table TARGET.indi_result_with_orcid stored as parquet as select * from SOURCE.indi_result_with_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
 compute stats TARGET.indi_result_with_orcid;
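One caveat on the hunk above: even after this fix the statement keeps a `stored as parquet` clause on a CREATE VIEW. In Hive and Impala a view is only a stored query and takes no storage format, so the line would presumably still fail to parse; the plain form, assuming the intent is simply to mirror the source view, would be:

    -- A view has no physical storage, so no STORED AS clause applies.
    create view TARGET.indi_funder_country_collab as
    select * from SOURCE.indi_funder_country_collab;
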
From 9d6203f79b6a9d16764ec9cc68c287b8da7a1fde Mon Sep 17 00:00:00 2001
From: Alessia Bardi
Date: Wed, 23 Feb 2022 15:00:53 +0100
Subject: [PATCH 50/51] test mapping datasource

---
 .../oa/provision/XmlRecordFactoryTest.java    | 30 +++++++++++++++++++
 .../dnetlib/dhp/oa/provision/datasource.json  |  1 +
 2 files changed, 31 insertions(+)
 create mode 100644 dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/datasource.json

diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java
index 2b5e08e92..501784872 100644
--- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java
+++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java
@@ -8,6 +8,7 @@ import java.io.IOException;
 import java.io.StringReader;
 import java.util.List;
 
+import eu.dnetlib.dhp.schema.oaf.Datasource;
 import org.apache.commons.io.IOUtils;
 import org.dom4j.Document;
 import org.dom4j.DocumentException;
@@ -129,4 +130,33 @@ public class XmlRecordFactoryTest {
 		System.out.println(doc.asXML());
 		assertEquals("", doc.valueOf("//rel/validated"));
 	}
+
+	@Test
+	public void testDatasource() throws IOException, DocumentException {
+		final ContextMapper contextMapper = new ContextMapper();
+
+		final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
+			XmlConverterJob.schemaLocation);
+
+		final Datasource d = OBJECT_MAPPER
+			.readValue(IOUtils.toString(getClass().getResourceAsStream("datasource.json")), Datasource.class);
+
+		final String xml = xmlRecordFactory.build(new JoinedEntity<>(d));
+
+		assertNotNull(xml);
+
+		final Document doc = new SAXReader().read(new StringReader(xml));
+
+		assertNotNull(doc);
+
+		System.out.println(doc.asXML());
+
+		// TODO add assertions based of values extracted from the XML record
+
+		assertEquals("National", doc.valueOf("//jurisdiction/@classname"));
+		assertEquals("true", doc.valueOf("//thematic"));
+		assertEquals("Journal article", doc.valueOf("//contentpolicy/@classname"));
+		assertEquals("Journal archive", doc.valueOf("//datasourcetypeui/@classname"));
+
+	}
 }
diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/datasource.json b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/datasource.json
new file mode 100644
index 000000000..ae069b8b5
--- /dev/null
+++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/datasource.json
@@ -0,0 +1 @@
+{"collectedfrom":[{"key":"10|openaire____::13068d7823ea0bd86516ac2cb66e96ba","value":"Jurnal Fakultas Sastra Universitas Ekasakti","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1645012035118,"id":"10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7","originalId":["274269ac6f3b::2579-5449","piwik:13"],"pid":[],"dateofcollection":"2020-01-21","dateoftransformation":null,"extraInfo":[],"oaiprovenance":null,"datasourcetype":{"classid":"pubsrepository::journal","classname":"pubsrepository::journal","schemeid":"dnet:datasource_typologies","schemename":"dnet:datasource_typologies"},"datasourcetypeui":{"classid":"pubsrepository::journal","classname":"Journal archive","schemeid":"dnet:datasource_typologies_ui","schemename":"dnet:datasource_typologies_ui"},"openairecompatibility":{"classid":"hostedBy","classname":"hostedBy","schemeid":"dnet:datasourceCompatibilityLevel","schemename":"dnet:datasourceCompatibilityLevel"},"officialname":{"value":"Jurnal Ilmiah Pendidikan Scholastic","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"englishname":{"value":"Jurnal Ilmiah Pendidikan Scholastic","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"websiteurl":{"value":"http://e-journal.sastra-unes.com/index.php/JIPS/index","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"logourl":null,"contactemail":{"value":"test@test.it","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"namespaceprefix":{"value":"ojs_25795449","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"latitude":{"value":"0.0","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"longitude":{"value":"0.0","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"dateofvalidation":null,"description":null,"subjects":[],"odnumberofitems":{"value":"0.0","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"odnumberofitemsdate":null,"odpolicies":null,"odlanguages":[],"odcontenttypes":[{"value":"Journal articles","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"accessinfopackage":[],"releasestartdate":null,"releaseenddate":null,"missionstatementurl":null,"dataprovider":{"value":false,"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"serviceprovider":{"value":false,"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"databaseaccesstype":null,"datauploadtype":null,"databaseaccessrestriction":null,"datauploadrestriction":null,"versioning":{"value":false,"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"citationguidelineurl":null,"qualitymanagementkind":null,"pidsystems":null,"certificates":null,"policies":[],"journal":{"name":"Jurnal Ilmiah Pendidikan Scholastic","issnPrinted":"2579-5449","issnOnline":"2579-5448","issnLinking":"2579-5447","ep":null,"iss":null,"sp":null,"vol":null,"edition":null,"conferenceplace":null,"conferencedate":null,"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"providedentitytypes":null,"providedproducttypes":null,"jurisdiction":{"classid":"National","classname":"National","schemeid":"eosc:jurisdictions","schemename":"eosc:jurisdictions"},"thematic":true,"knowledgegraph":true,"contentpolicies":[{"classid":"Journal article","classname":"Journal article","schemeid":"eosc:contentpolicies","schemename":"eosc:contentpolicies"}]}
\ No newline at end of file

From ab36154e3e6d6799fb14fc9de860f09c7178c1dd Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Thu, 24 Feb 2022 12:16:17 +0100
Subject: [PATCH 51/51] added more ignores

---
 .gitignore | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index f4fb46f2e..0a50a4947 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,4 +27,4 @@ spark-warehouse
 /**/job-override.properties
 /**/*.log
 /**/.factorypath
-
+/**/.scalafmt.conf