From c0265213a0b2119a30f00ec0013c6c88de3b826a Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Wed, 13 May 2020 12:00:27 +0200 Subject: [PATCH 01/23] partial implementation --- .../dhp/broker/model/EventFactory.java | 31 ++++----- .../eu/dnetlib/dhp/broker/model/Topic.java | 52 +++++++++++++++ .../broker/oa/GenerateEventsApplication.java | 55 +++++++--------- .../broker/oa/util/EnrichMissingAbstract.java | 30 +++++---- .../oa/util/EnrichMissingAuthorOrcid.java | 32 +++++----- .../oa/util/EnrichMissingOpenAccess.java | 31 ++++----- .../dhp/broker/oa/util/EnrichMissingPid.java | 28 ++++----- .../broker/oa/util/EnrichMissingProject.java | 30 ++++----- .../oa/util/EnrichMissingPublicationDate.java | 29 ++++----- .../broker/oa/util/EnrichMissingSubject.java | 30 +++++---- .../broker/oa/util/EnrichMoreOpenAccess.java | 29 ++++----- .../dhp/broker/oa/util/EnrichMorePid.java | 28 ++++----- .../dhp/broker/oa/util/EnrichMoreSubject.java | 30 +++++---- .../dhp/broker/oa/util/UpdateInfo.java | 59 ++++++++++++++--- .../dhp/broker/oa/util/UpdateMatcher.java | 63 +++++++++++++++++++ 15 files changed, 363 insertions(+), 194 deletions(-) create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Topic.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateMatcher.java diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java index 0694556b2..9e5d98644 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java @@ -29,31 +29,32 @@ public class EventFactory { "yyyy-MM-dd" }; - public static Event newBrokerEvent(final Result source, final Result target, final UpdateInfo updateInfo) { + public static Event 
newBrokerEvent(final UpdateInfo updateInfo) { final long now = new Date().getTime(); final Event res = new Event(); - final Map map = createMapFromResult(target, source, updateInfo); + final Map map = createMapFromResult(updateInfo); - final String payload = createPayload(target, updateInfo); + final String payload = createPayload(updateInfo); final String eventId = calculateEventId( - updateInfo.getTopic(), target.getOriginalId().get(0), updateInfo.getHighlightValueAsString()); + updateInfo.getTopicPath(), updateInfo.getTarget().getOriginalId().get(0), + updateInfo.getHighlightValueAsString()); res.setEventId(eventId); res.setProducerId(PRODUCER_ID); res.setPayload(payload); res.setMap(map); - res.setTopic(updateInfo.getTopic()); + res.setTopic(updateInfo.getTopicPath()); res.setCreationDate(now); res.setExpiryDate(calculateExpiryDate(now)); res.setInstantMessage(false); return res; } - private static String createPayload(final Result result, final UpdateInfo updateInfo) { + private static String createPayload(final UpdateInfo updateInfo) { final OpenAireEventPayload payload = new OpenAireEventPayload(); // TODO @@ -62,32 +63,34 @@ public class EventFactory { return payload.toJSON(); } - private static Map createMapFromResult(final Result oaf, final Result source, - final UpdateInfo updateInfo) { + private static Map createMapFromResult(final UpdateInfo updateInfo) { final Map map = new HashMap<>(); - final List collectedFrom = oaf.getCollectedfrom(); + final Result source = updateInfo.getSource(); + final Result target = updateInfo.getTarget(); + + final List collectedFrom = target.getCollectedfrom(); if (collectedFrom.size() == 1) { map.put("target_datasource_id", collectedFrom.get(0).getKey()); map.put("target_datasource_name", collectedFrom.get(0).getValue()); } - final List ids = oaf.getOriginalId(); + final List ids = target.getOriginalId(); if (ids.size() > 0) { map.put("target_publication_id", ids.get(0)); } - final List titles = oaf.getTitle(); + final 
List titles = target.getTitle(); if (titles.size() > 0) { map.put("target_publication_title", titles.get(0)); } - final long date = parseDateTolong(oaf.getDateofacceptance().getValue()); + final long date = parseDateTolong(target.getDateofacceptance().getValue()); if (date > 0) { map.put("target_dateofacceptance", date); } - final List subjects = oaf.getSubject(); + final List subjects = target.getSubject(); if (subjects.size() > 0) { map .put( @@ -95,7 +98,7 @@ public class EventFactory { subjects.stream().map(StructuredProperty::getValue).collect(Collectors.toList())); } - final List authors = oaf.getAuthor(); + final List authors = target.getAuthor(); if (authors.size() > 0) { map .put( diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Topic.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Topic.java new file mode 100644 index 000000000..29f6cbe3a --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Topic.java @@ -0,0 +1,52 @@ + +package eu.dnetlib.dhp.broker.model; + +public enum Topic { + + // ENRICHMENT MISSING + ENRICH_MISSING_OA_VERSION("ENRICH/MISSING/OPENACCESS_VERSION"), ENRICH_MISSING_ABSTRACT( + "ENRICH/MISSING/ABSTRACT"), ENRICH_MISSING_PUBLICATION_DATE( + "ENRICH/MISSING/PUBLICATION_DATE"), ENRICH_MISSING_PID( + "ENRICH/MISSING/PID"), ENRICH_MISSING_PROJECT("ENRICH/MISSING/PROJECT"), ENRICH_MISSING_SOFTWARE( + "ENRICH/MISSING/SOFTWARE"), ENRICH_MISSING_SUBJECT_MESHEUROPMC( + "ENRICH/MISSING/SUBJECT/MESHEUROPMC"), ENRICH_MISSING_SUBJECT_ARXIV( + "ENRICH/MISSING/SUBJECT/ARXIV"), ENRICH_MISSING_SUBJECT_JEL( + "ENRICH/MISSING/SUBJECT/JEL"), ENRICH_MISSING_SUBJECT_DDC( + "ENRICH/MISSING/SUBJECT/DDC"), ENRICH_MISSING_SUBJECT_ACM( + "ENRICH/MISSING/SUBJECT/ACM"), ENRICH_MISSING_SUBJECT_RVK( + "ENRICH/MISSING/SUBJECT/RVK"), ENRICH_MISSING_AUTHOR_ORCID( + "ENRICH/MISSING/AUTHOR/ORCID"), + + // ENRICHMENT MORE + ENRICH_MORE_PID("ENRICH/MORE/PID"), 
ENRICH_MORE_OA_VERSION("ENRICH/MORE/OPENACCESS_VERSION"), ENRICH_MORE_ABSTRACT( + "ENRICH/MORE/ABSTRACT"), ENRICH_MORE_PUBLICATION_DATE("ENRICH/MORE/PUBLICATION_DATE"), ENRICH_MORE_PROJECT( + "ENRICH/MORE/PROJECT"), ENRICH_MORE_SUBJECT_MESHEUROPMC( + "ENRICH/MORE/SUBJECT/MESHEUROPMC"), ENRICH_MORE_SUBJECT_ARXIV( + "ENRICH/MORE/SUBJECT/ARXIV"), ENRICH_MORE_SUBJECT_JEL( + "ENRICH/MORE/SUBJECT/JEL"), ENRICH_MORE_SUBJECT_DDC( + "ENRICH/MORE/SUBJECT/DDC"), ENRICH_MORE_SUBJECT_ACM( + "ENRICH/MORE/SUBJECT/ACM"), ENRICH_MORE_SUBJECT_RVK("ENRICH/MORE/SUBJECT/RVK"), + + // ADDITION + ADD_BY_PROJECT("ADD/BY_PROJECT"); + + Topic(final String path) { + this.path = path; + } + + protected String path; + + public String getPath() { + return this.path; + } + + public static Topic fromPath(final String path) { + for (final Topic t : Topic.values()) { + if (t.getPath().equals(path)) { + return t; + } + } + return null; + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java index 54d4ef36a..c4c167c13 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java @@ -14,8 +14,6 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.fasterxml.jackson.databind.ObjectMapper; - import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.broker.model.Event; import eu.dnetlib.dhp.broker.model.EventFactory; @@ -30,6 +28,7 @@ import eu.dnetlib.dhp.broker.oa.util.EnrichMoreOpenAccess; import eu.dnetlib.dhp.broker.oa.util.EnrichMorePid; import eu.dnetlib.dhp.broker.oa.util.EnrichMoreSubject; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import 
eu.dnetlib.dhp.broker.oa.util.UpdateMatcher; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.oaf.Result; @@ -37,7 +36,16 @@ public class GenerateEventsApplication { private static final Logger log = LoggerFactory.getLogger(GenerateEventsApplication.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final UpdateMatcher enrichMissingAbstract = new EnrichMissingAbstract(); + private static final UpdateMatcher enrichMissingAuthorOrcid = new EnrichMissingAuthorOrcid(); + private static final UpdateMatcher enrichMissingOpenAccess = new EnrichMissingOpenAccess(); + private static final UpdateMatcher enrichMissingPid = new EnrichMissingPid(); + private static final UpdateMatcher enrichMissingProject = new EnrichMissingProject(); + private static final UpdateMatcher enrichMissingPublicationDate = new EnrichMissingPublicationDate(); + private static final UpdateMatcher enrichMissingSubject = new EnrichMissingSubject(); + private static final UpdateMatcher enrichMoreOpenAccess = new EnrichMoreOpenAccess(); + private static final UpdateMatcher enrichMorePid = new EnrichMorePid(); + private static final UpdateMatcher enrichMoreSubject = new EnrichMoreSubject(); public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( @@ -76,37 +84,22 @@ public class GenerateEventsApplication { } private List generateEvents(final Result... 
children) { - final List list = new ArrayList<>(); + final List> list = new ArrayList<>(); - for (final Result source : children) { - for (final Result target : children) { - if (source != target) { - list - .addAll( - findUpdates(source, target) - .stream() - .map(info -> EventFactory.newBrokerEvent(source, target, info)) - .collect(Collectors.toList())); - } - } + for (final Result target : children) { + list.addAll(enrichMissingAbstract.searchUpdatesForRecord(target, children)); + list.addAll(enrichMissingAuthorOrcid.searchUpdatesForRecord(target, children)); + list.addAll(enrichMissingOpenAccess.searchUpdatesForRecord(target, children)); + list.addAll(enrichMissingPid.searchUpdatesForRecord(target, children)); + list.addAll(enrichMissingProject.searchUpdatesForRecord(target, children)); + list.addAll(enrichMissingPublicationDate.searchUpdatesForRecord(target, children)); + list.addAll(enrichMissingSubject.searchUpdatesForRecord(target, children)); + list.addAll(enrichMoreOpenAccess.searchUpdatesForRecord(target, children)); + list.addAll(enrichMorePid.searchUpdatesForRecord(target, children)); + list.addAll(enrichMoreSubject.searchUpdatesForRecord(target, children)); } - return list; - } - - private List> findUpdates(final Result source, final Result target) { - final List> list = new ArrayList<>(); - list.addAll(EnrichMissingAbstract.findUpdates(source, target)); - list.addAll(EnrichMissingAuthorOrcid.findUpdates(source, target)); - list.addAll(EnrichMissingOpenAccess.findUpdates(source, target)); - list.addAll(EnrichMissingPid.findUpdates(source, target)); - list.addAll(EnrichMissingProject.findUpdates(source, target)); - list.addAll(EnrichMissingPublicationDate.findUpdates(source, target)); - list.addAll(EnrichMissingSubject.findUpdates(source, target)); - list.addAll(EnrichMoreOpenAccess.findUpdates(source, target)); - list.addAll(EnrichMorePid.findUpdates(source, target)); - list.addAll(EnrichMoreSubject.findUpdates(source, target)); - return list; + 
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList()); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAbstract.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAbstract.java index 493d1f97c..6b6e35d1d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAbstract.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAbstract.java @@ -1,31 +1,35 @@ package eu.dnetlib.dhp.broker.oa.util; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import eu.dnetlib.broker.objects.OpenAireEventPayload; +import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMissingAbstract extends UpdateInfo { +public class EnrichMissingAbstract extends UpdateMatcher { - public static List findUpdates(final Result source, final Result target) { - // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); - return Arrays.asList(); - } - - private EnrichMissingAbstract(final String highlightValue, final float trust) { - super("ENRICH/MISSING/ABSTRACT", highlightValue, trust); + public EnrichMissingAbstract() { + super(false); } @Override - public void compileHighlight(final OpenAireEventPayload payload) { - payload.getHighlight().getAbstracts().add(getHighlightValue()); + protected List> findUpdates(final Result source, final Result target) { + if (isMissing(target.getDescription()) && !isMissing(source.getDescription())) { + return Arrays.asList(generateUpdateInfo(source.getDescription().get(0).getValue(), source, target)); + } + return new ArrayList<>(); } @Override - public String getHighlightValueAsString() { - return getHighlightValue(); + public UpdateInfo generateUpdateInfo(final String highlightValue, final Result source, + final Result target) { + return new UpdateInfo<>( + 
Topic.ENRICH_MISSING_ABSTRACT, + highlightValue, source, target, + (p, s) -> p.getAbstracts().add(s), + s -> s); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAuthorOrcid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAuthorOrcid.java index 6899c62a3..d81427e05 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAuthorOrcid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAuthorOrcid.java @@ -4,28 +4,30 @@ package eu.dnetlib.dhp.broker.oa.util; import java.util.Arrays; import java.util.List; -import eu.dnetlib.broker.objects.OpenAireEventPayload; +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMissingAuthorOrcid extends UpdateInfo { +public class EnrichMissingAuthorOrcid extends UpdateMatcher> { - public static List findUpdates(final Result source, final Result target) { + public EnrichMissingAuthorOrcid() { + super(true); + } + + @Override + protected List>> findUpdates(final Result source, final Result target) { // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); return Arrays.asList(); } - private EnrichMissingAuthorOrcid(final String highlightValue, final float trust) { - super("ENRICH/MISSING/AUTHOR/ORCID", highlightValue, trust); - } - @Override - public void compileHighlight(final OpenAireEventPayload payload) { - // TODO + public UpdateInfo> generateUpdateInfo(final Pair highlightValue, + final Result source, final Result target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_AUTHOR_ORCID, + highlightValue, source, target, + (p, pair) -> p.getCreators().add(pair.getLeft() + " - ORCID: " + pair.getRight()), + pair -> pair.getLeft() + "::" + pair.getRight()); } - - @Override - public String getHighlightValueAsString() { - return 
getHighlightValue(); - } - } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingOpenAccess.java index 9464130f3..9079ee24b 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingOpenAccess.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingOpenAccess.java @@ -5,28 +5,29 @@ import java.util.Arrays; import java.util.List; import eu.dnetlib.broker.objects.Instance; -import eu.dnetlib.broker.objects.OpenAireEventPayload; +import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMissingOpenAccess extends UpdateInfo { +public class EnrichMissingOpenAccess extends UpdateMatcher { + + public EnrichMissingOpenAccess() { + super(true); + } + + @Override + protected List> findUpdates(final Result source, final Result target) { - public static List findUpdates(final Result source, final Result target) { - // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); return Arrays.asList(); } - private EnrichMissingOpenAccess(final Instance highlightValue, final float trust) { - super("ENRICH/MISSING/OPENACCESS_VERSION", highlightValue, trust); - } - @Override - public void compileHighlight(final OpenAireEventPayload payload) { - payload.getHighlight().getInstances().add(getHighlightValue()); - } - - @Override - public String getHighlightValueAsString() { - return getHighlightValue().getUrl(); + public UpdateInfo generateUpdateInfo(final Instance highlightValue, final Result source, + final Result target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_OA_VERSION, + highlightValue, source, target, + (p, i) -> p.getInstances().add(i), + Instance::getUrl); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPid.java 
b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPid.java index 293d4993f..0b4045a0e 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPid.java @@ -4,29 +4,29 @@ package eu.dnetlib.dhp.broker.oa.util; import java.util.Arrays; import java.util.List; -import eu.dnetlib.broker.objects.OpenAireEventPayload; import eu.dnetlib.broker.objects.Pid; +import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMissingPid extends UpdateInfo { +public class EnrichMissingPid extends UpdateMatcher { - public static List findUpdates(final Result source, final Result target) { + public EnrichMissingPid() { + super(true); + } + + @Override + protected List> findUpdates(final Result source, final Result target) { // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); return Arrays.asList(); } - private EnrichMissingPid(final Pid highlightValue, final float trust) { - super("ENRICH/MISSING/PID", highlightValue, trust); - } - @Override - public void compileHighlight(final OpenAireEventPayload payload) { - payload.getHighlight().getPids().add(getHighlightValue()); - } - - @Override - public String getHighlightValueAsString() { - return getHighlightValue().getType() + "::" + getHighlightValue().getValue(); + public UpdateInfo generateUpdateInfo(final Pid highlightValue, final Result source, final Result target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_PID, + highlightValue, source, target, + (p, pid) -> p.getPids().add(pid), + pid -> pid.getType() + "::" + pid.getValue()); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingProject.java index a22c179a2..45b16801c 100644 
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingProject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingProject.java @@ -4,30 +4,30 @@ package eu.dnetlib.dhp.broker.oa.util; import java.util.Arrays; import java.util.List; -import eu.dnetlib.broker.objects.OpenAireEventPayload; import eu.dnetlib.broker.objects.Project; +import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMissingProject extends UpdateInfo { +public class EnrichMissingProject extends UpdateMatcher { - public static List findUpdates(final Result source, final Result target) { + public EnrichMissingProject() { + super(true); + } + + @Override + protected List> findUpdates(final Result source, final Result target) { // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); return Arrays.asList(); } - private EnrichMissingProject(final Project highlightValue, final float trust) { - super("ENRICH/MISSING/PROJECT", highlightValue, trust); - } - @Override - public void compileHighlight(final OpenAireEventPayload payload) { - payload.getHighlight().getProjects().add(getHighlightValue()); - } - - @Override - public String getHighlightValueAsString() { - return getHighlightValue().getFunder() + "::" + getHighlightValue().getFundingProgram() - + getHighlightValue().getCode(); + public UpdateInfo generateUpdateInfo(final Project highlightValue, final Result source, + final Result target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_PROJECT, + highlightValue, source, target, + (p, prj) -> p.getProjects().add(prj), + prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode()); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPublicationDate.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPublicationDate.java index 869dca264..7fcd2a66f 
100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPublicationDate.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPublicationDate.java @@ -4,28 +4,29 @@ package eu.dnetlib.dhp.broker.oa.util; import java.util.Arrays; import java.util.List; -import eu.dnetlib.broker.objects.OpenAireEventPayload; +import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMissingPublicationDate extends UpdateInfo { +public class EnrichMissingPublicationDate extends UpdateMatcher { - public static List findUpdates(final Result source, final Result target) { + public EnrichMissingPublicationDate() { + super(false); + } + + @Override + protected List> findUpdates(final Result source, final Result target) { // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); return Arrays.asList(); } - private EnrichMissingPublicationDate(final String highlightValue, final float trust) { - super("ENRICH/MISSING/PUBLICATION_DATE", highlightValue, trust); - } - @Override - public void compileHighlight(final OpenAireEventPayload payload) { - payload.getHighlight().setPublicationdate(getHighlightValue()); - } - - @Override - public String getHighlightValueAsString() { - return getHighlightValue(); + public UpdateInfo generateUpdateInfo(final String highlightValue, final Result source, + final Result target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_PUBLICATION_DATE, + highlightValue, source, target, + (p, date) -> p.setPublicationdate(date), + s -> s); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingSubject.java index a2ed5d043..4470bd9d9 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingSubject.java +++ 
b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingSubject.java @@ -4,12 +4,19 @@ package eu.dnetlib.dhp.broker.oa.util; import java.util.Arrays; import java.util.List; -import eu.dnetlib.broker.objects.OpenAireEventPayload; +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMissingSubject extends UpdateInfo { +public class EnrichMissingSubject extends UpdateMatcher> { - public static List findUpdates(final Result source, final Result target) { + public EnrichMissingSubject() { + super(true); + } + + @Override + protected List>> findUpdates(final Result source, final Result target) { // MESHEUROPMC // ARXIV // JEL @@ -19,18 +26,15 @@ public class EnrichMissingSubject extends UpdateInfo { return Arrays.asList(); } - private EnrichMissingSubject(final String subjectClassification, final String highlightValue, final float trust) { - super("ENRICH/MISSING/SUBJECT/" + subjectClassification, highlightValue, trust); - } - @Override - public void compileHighlight(final OpenAireEventPayload payload) { - payload.getHighlight().getSubjects().add(getHighlightValue()); - } + public UpdateInfo> generateUpdateInfo(final Pair highlightValue, + final Result source, final Result target) { - @Override - public String getHighlightValueAsString() { - return getHighlightValue(); + return new UpdateInfo<>( + Topic.fromPath("ENRICH/MISSING/SUBJECT/" + highlightValue.getLeft()), + highlightValue, source, target, + (p, pair) -> p.getSubjects().add(pair.getRight()), + pair -> pair.getLeft() + "::" + pair.getRight()); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreOpenAccess.java index 4f1e88d3d..bc37ce659 100644 --- 
a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreOpenAccess.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreOpenAccess.java @@ -5,28 +5,29 @@ import java.util.Arrays; import java.util.List; import eu.dnetlib.broker.objects.Instance; -import eu.dnetlib.broker.objects.OpenAireEventPayload; +import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMoreOpenAccess extends UpdateInfo { +public class EnrichMoreOpenAccess extends UpdateMatcher { - public static List findUpdates(final Result source, final Result target) { + public EnrichMoreOpenAccess() { + super(true); + } + + @Override + protected List> findUpdates(final Result source, final Result target) { // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); return Arrays.asList(); } - private EnrichMoreOpenAccess(final Instance highlightValue, final float trust) { - super("ENRICH/MORE/OPENACCESS_VERSION", highlightValue, trust); - } - @Override - public void compileHighlight(final OpenAireEventPayload payload) { - payload.getHighlight().getInstances().add(getHighlightValue()); - } - - @Override - public String getHighlightValueAsString() { - return getHighlightValue().getUrl(); + public UpdateInfo generateUpdateInfo(final Instance highlightValue, final Result source, + final Result target) { + return new UpdateInfo<>( + Topic.ENRICH_MORE_OA_VERSION, + highlightValue, source, target, + (p, i) -> p.getInstances().add(i), + Instance::getUrl); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMorePid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMorePid.java index ecf2cf310..8cd67f553 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMorePid.java +++ 
b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMorePid.java @@ -4,29 +4,29 @@ package eu.dnetlib.dhp.broker.oa.util; import java.util.Arrays; import java.util.List; -import eu.dnetlib.broker.objects.OpenAireEventPayload; import eu.dnetlib.broker.objects.Pid; +import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMorePid extends UpdateInfo { +public class EnrichMorePid extends UpdateMatcher { - public static List findUpdates(final Result source, final Result target) { + public EnrichMorePid() { + super(true); + } + + @Override + protected List> findUpdates(final Result source, final Result target) { // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); return Arrays.asList(); } - private EnrichMorePid(final Pid highlightValue, final float trust) { - super("ENRICH/MORE/PID", highlightValue, trust); - } - @Override - public void compileHighlight(final OpenAireEventPayload payload) { - payload.getHighlight().getPids().add(getHighlightValue()); - } - - @Override - public String getHighlightValueAsString() { - return getHighlightValue().getType() + "::" + getHighlightValue().getValue(); + public UpdateInfo generateUpdateInfo(final Pid highlightValue, final Result source, final Result target) { + return new UpdateInfo<>( + Topic.ENRICH_MORE_PID, + highlightValue, source, target, + (p, pid) -> p.getPids().add(pid), + pid -> pid.getType() + "::" + pid.getValue()); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreSubject.java index f29b86292..9e0d8e693 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreSubject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreSubject.java @@ -4,12 +4,19 @@ package eu.dnetlib.dhp.broker.oa.util; 
import java.util.Arrays; import java.util.List; -import eu.dnetlib.broker.objects.OpenAireEventPayload; +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMoreSubject extends UpdateInfo { +public class EnrichMoreSubject extends UpdateMatcher> { - public static List findUpdates(final Result source, final Result target) { + public EnrichMoreSubject() { + super(true); + } + + @Override + protected List>> findUpdates(final Result source, final Result target) { // MESHEUROPMC // ARXIV // JEL @@ -19,18 +26,15 @@ public class EnrichMoreSubject extends UpdateInfo { return Arrays.asList(); } - private EnrichMoreSubject(final String subjectClassification, final String highlightValue, final float trust) { - super("ENRICH/MORE/SUBJECT/" + subjectClassification, highlightValue, trust); - } - @Override - public void compileHighlight(final OpenAireEventPayload payload) { - payload.getHighlight().getSubjects().add(getHighlightValue()); - } + public UpdateInfo> generateUpdateInfo(final Pair highlightValue, + final Result source, final Result target) { - @Override - public String getHighlightValueAsString() { - return getHighlightValue(); + return new UpdateInfo<>( + Topic.fromPath("ENRICH/MORE/SUBJECT/" + highlightValue.getLeft()), + highlightValue, source, target, + (p, pair) -> p.getSubjects().add(pair.getRight()), + pair -> pair.getLeft() + "::" + pair.getRight()); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java index f7b6b69e9..1dfc14e5e 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java @@ -1,36 +1,77 @@ package eu.dnetlib.dhp.broker.oa.util; +import 
java.util.function.BiConsumer; +import java.util.function.Function; + import eu.dnetlib.broker.objects.OpenAireEventPayload; +import eu.dnetlib.broker.objects.Publication; +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.schema.oaf.Result; -public abstract class UpdateInfo { +public final class UpdateInfo { - private final String topic; + private final Topic topic; private final T highlightValue; + private final Result source; + + private final Result target; + + private final BiConsumer compileHighlight; + + private final Function highlightToString; + private final float trust; - protected UpdateInfo(final String topic, final T highlightValue, final float trust) { + protected UpdateInfo(final Topic topic, final T highlightValue, final Result source, final Result target, + final BiConsumer compileHighlight, + final Function highlightToString) { this.topic = topic; this.highlightValue = highlightValue; - this.trust = trust; + this.source = source; + this.target = target; + this.compileHighlight = compileHighlight; + this.highlightToString = highlightToString; + this.trust = calculateTrust(source, target); } public T getHighlightValue() { return highlightValue; } + public Result getSource() { + return source; + } + + public Result getTarget() { + return target; + } + + private float calculateTrust(final Result source, final Result target) { + // TODO + return 0.9f; + } + + protected Topic getTopic() { + return topic; + } + + public String getTopicPath() { + return topic.getPath(); + } + public float getTrust() { return trust; } - public String getTopic() { - return topic; + public void compileHighlight(final OpenAireEventPayload payload) { + compileHighlight.accept(payload.getHighlight(), getHighlightValue()); } - abstract public void compileHighlight(OpenAireEventPayload payload); - - abstract public String getHighlightValueAsString(); + public String getHighlightValueAsString() { + return highlightToString.apply(getHighlightValue()); + } } diff 
--git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateMatcher.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateMatcher.java new file mode 100644 index 000000000..3fd6d4027 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateMatcher.java @@ -0,0 +1,63 @@ + +package eu.dnetlib.dhp.broker.oa.util; + +import java.util.Arrays; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.commons.lang3.StringUtils; + +import eu.dnetlib.dhp.schema.oaf.Field; +import eu.dnetlib.dhp.schema.oaf.Result; + +public abstract class UpdateMatcher { + + private final boolean multipleUpdate; + + public UpdateMatcher(final boolean multipleUpdate) { + this.multipleUpdate = multipleUpdate; + } + + public Collection> searchUpdatesForRecord(final Result res, final Result...
others) { + + final Map> infoMap = new HashMap<>(); + + for (final Result source : others) { + if (source != res) { + for (final UpdateInfo info : findUpdates(source, res)) { + final String s = DigestUtils.md5Hex(info.getHighlightValueAsString()); + if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) { + // first update seen for this highlight value, or a more trusted one: keep it + infoMap.put(s, info); + } + } + } + } + + final Collection> values = infoMap.values(); + + if (values.isEmpty() || multipleUpdate) { + return values; + } else { + final UpdateInfo v = values + .stream() + .sorted((o1, o2) -> Float.compare(o2.getTrust(), o1.getTrust())) // most trusted first + .findFirst() + .get(); + return Arrays.asList(v); + } + } + + protected abstract List> findUpdates(Result source, Result target); + + protected abstract UpdateInfo generateUpdateInfo(final T highlightValue, final Result source, + final Result target); + + protected static boolean isMissing(final List> list) { + return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0).getValue()); + } + +} From 5ecacad70a0539bd94e336ad604cc32ac8e0ce1a Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 13 May 2020 17:01:11 +0200 Subject: [PATCH 02/23] fixed default resource typing in Oaf/Odf mapping --- .../raw/AbstractMdRecordToOafMapper.java | 168 +++++++++--------- 1 file changed, 84 insertions(+), 84 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index fd12716b4..be0b91022 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -64,7 +64,7 @@ public abstract class AbstractMdRecordToOafMapper { } protected 
static final Qualifier MAIN_TITLE_QUALIFIER = qualifier( - "main title", "main title",
"dnet:dataCite_title", "dnet:dataCite_title"); + "main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); protected AbstractMdRecordToOafMapper(final Map code2name) { this.code2name = code2name; @@ -75,20 +75,20 @@ public abstract class AbstractMdRecordToOafMapper { DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); final Document doc = DocumentHelper - .parseText( - xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3)); + .parseText( + xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3)); final String type = doc.valueOf("//dr:CobjCategory/@type"); final KeyValue collectedFrom = getProvenanceDatasource( - doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); + doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); if (collectedFrom == null) { return null; } final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) - ? collectedFrom - : getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name"); + ? 
collectedFrom + : getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name"); if (hostedBy == null) { return null; @@ -112,22 +112,21 @@ public abstract class AbstractMdRecordToOafMapper { } return keyValue( - createOpenaireId(10, dsId, true), - dsName); + createOpenaireId(10, dsId, true), + dsName); } protected List createOafs( - final Document doc, - final String type, - final KeyValue collectedFrom, - final KeyValue hostedBy, - final DataInfo info, - final long lastUpdateTimestamp) { + final Document doc, + final String type, + final KeyValue collectedFrom, + final KeyValue hostedBy, + final DataInfo info, + final long lastUpdateTimestamp) { final List oafs = new ArrayList<>(); switch (type.toLowerCase()) { - case "": case "publication": final Publication p = new Publication(); populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); @@ -138,7 +137,7 @@ public abstract class AbstractMdRecordToOafMapper { case "dataset": final Dataset d = new Dataset(); populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - d.setResulttype(PUBLICATION_DEFAULT_RESULTTYPE); + d.setResulttype(DATASET_DEFAULT_RESULTTYPE); d.setStoragedate(prepareDatasetStorageDate(doc, info)); d.setDevice(prepareDatasetDevice(doc, info)); d.setSize(prepareDatasetSize(doc, info)); @@ -158,6 +157,7 @@ public abstract class AbstractMdRecordToOafMapper { s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info)); oafs.add(s); break; + case "": case "otherresearchproducts": default: final OtherResearchProduct o = new OtherResearchProduct(); @@ -179,10 +179,10 @@ public abstract class AbstractMdRecordToOafMapper { } private List addProjectRels( - final Document doc, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp) { + final Document doc, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp) { final List res = new ArrayList<>(); @@ -196,15 +196,15 @@ 
public abstract class AbstractMdRecordToOafMapper { final String projectId = createOpenaireId(40, originalId, true); res - .add( - getRelation( - docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, collectedFrom, info, - lastUpdateTimestamp)); + .add( + getRelation( + docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, collectedFrom, info, + lastUpdateTimestamp)); res - .add( - getRelation( - projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, collectedFrom, info, - lastUpdateTimestamp)); + .add( + getRelation( + projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, collectedFrom, info, + lastUpdateTimestamp)); } } @@ -212,7 +212,7 @@ public abstract class AbstractMdRecordToOafMapper { } protected Relation getRelation(String source, String target, String relType, String subRelType, String relClass, - KeyValue collectedFrom, DataInfo info, long lastUpdateTimestamp) { + KeyValue collectedFrom, DataInfo info, long lastUpdateTimestamp) { final Relation rel = new Relation(); rel.setRelType(relType); rel.setSubRelType(subRelType); @@ -226,27 +226,27 @@ public abstract class AbstractMdRecordToOafMapper { } protected abstract List addOtherResultRels( - final Document doc, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp); + final Document doc, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp); private void populateResultFields( - final Result r, - final Document doc, - final KeyValue collectedFrom, - final KeyValue hostedBy, - final DataInfo info, - final long lastUpdateTimestamp) { + final Result r, + final Document doc, + final KeyValue collectedFrom, + final KeyValue hostedBy, + final DataInfo info, + final long lastUpdateTimestamp) { r.setDataInfo(info); r.setLastupdatetimestamp(lastUpdateTimestamp); r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false)); r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier"))); 
r.setCollectedfrom(Arrays.asList(collectedFrom)); r - .setPid( - prepareListStructProps( - doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); + .setPid( + prepareListStructProps( + doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); r.setDateofcollection(doc.valueOf("//dr:dateOfCollection")); r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation")); r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES @@ -289,7 +289,7 @@ public abstract class AbstractMdRecordToOafMapper { protected abstract Qualifier prepareResourceType(Document doc, DataInfo info); protected abstract List prepareInstances( - Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby); + Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby); protected abstract List> prepareSources(Document doc, DataInfo info); @@ -314,13 +314,13 @@ public abstract class AbstractMdRecordToOafMapper { protected abstract List prepareAuthors(Document doc, DataInfo info); protected abstract List> prepareOtherResearchProductTools( - Document doc, DataInfo info); + Document doc, DataInfo info); protected abstract List> prepareOtherResearchProductContactGroups( - Document doc, DataInfo info); + Document doc, DataInfo info); protected abstract List> prepareOtherResearchProductContactPersons( - Document doc, DataInfo info); + Document doc, DataInfo info); protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info); @@ -329,7 +329,7 @@ public abstract class AbstractMdRecordToOafMapper { protected abstract List prepareSoftwareLicenses(Document doc, DataInfo info); protected abstract List> prepareSoftwareDocumentationUrls( - Document doc, DataInfo info); + Document doc, DataInfo info); protected abstract List prepareDatasetGeoLocations(Document doc, DataInfo info); @@ -359,37 +359,37 @@ public abstract class AbstractMdRecordToOafMapper { final String edition = 
n.valueOf("@edition"); if (StringUtils.isNotBlank(name)) { return journal( - name, - issnPrinted, - issnOnline, - issnLinking, - ep, - iss, - sp, - vol, - edition, - null, - null, - info); + name, + issnPrinted, + issnOnline, + issnLinking, + ep, + iss, + sp, + vol, + edition, + null, + null, + info); } } return null; } protected Qualifier prepareQualifier( - final Node node, final String xpath, final String schemeId, final String schemeName) { + final Node node, final String xpath, final String schemeId, final String schemeName) { final String classId = node.valueOf(xpath); final String className = code2name.get(classId); return qualifier(classId, className, schemeId, schemeName); } protected List prepareListStructProps( - final Node node, - final String xpath, - final String xpathClassId, - final String schemeId, - final String schemeName, - final DataInfo info) { + final Node node, + final String xpath, + final String xpathClassId, + final String schemeId, + final String schemeName, + final DataInfo info) { final List res = new ArrayList<>(); for (final Object o : node.selectNodes(xpath)) { final Node n = (Node) o; @@ -401,7 +401,7 @@ public abstract class AbstractMdRecordToOafMapper { } protected List prepareListStructProps( - final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) { + final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) { final List res = new ArrayList<>(); for (final Object o : node.selectNodes(xpath)) { final Node n = (Node) o; @@ -411,19 +411,19 @@ public abstract class AbstractMdRecordToOafMapper { } protected List prepareListStructProps( - final Node node, final String xpath, final DataInfo info) { + final Node node, final String xpath, final DataInfo info) { final List res = new ArrayList<>(); for (final Object o : node.selectNodes(xpath)) { final Node n = (Node) o; res - .add( - structuredProperty( - n.getText(), - n.valueOf("@classid"), - n.valueOf("@classname"), - 
n.valueOf("@schemeid"), - n.valueOf("@schemename"), - info)); + .add( + structuredProperty( + n.getText(), + n.valueOf("@classid"), + n.valueOf("@classname"), + n.valueOf("@schemeid"), + n.valueOf("@schemename"), + info)); } return res; } @@ -450,7 +450,7 @@ public abstract class AbstractMdRecordToOafMapper { if (n == null) { return dataInfo( - false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9"); + false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9"); } final String paClassId = n.valueOf("./oaf:provenanceaction/@classid"); @@ -464,12 +464,12 @@ public abstract class AbstractMdRecordToOafMapper { final String trust = n.valueOf("./oaf:trust"); return dataInfo( - deletedbyinference, - inferenceprovenance, - inferred, - false, - qualifier(paClassId, paClassName, paSchemeId, paSchemeName), - trust); + deletedbyinference, + inferenceprovenance, + inferred, + false, + qualifier(paClassId, paClassName, paSchemeId, paSchemeName), + trust); } protected Field prepareField(final Node node, final String xpath, final DataInfo info) { @@ -477,7 +477,7 @@ public abstract class AbstractMdRecordToOafMapper { } protected List> prepareListFields( - final Node node, final String xpath, final DataInfo info) { + final Node node, final String xpath, final DataInfo info) { return listFields(info, prepareListString(node, xpath)); } From ab37953332755ed53ebf95655547a4736d8f7395 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 14 May 2020 10:25:41 +0200 Subject: [PATCH 03/23] added global properties in wf definitions to avoid repeating name-node and job-tracker in the (many) distcp actions; reintroduced output directory removal at the beginning of each spark action --- .../dnetlib/dhp/bulktag/SparkBulkTagJob.java | 2 + .../SparkCountryPropagationJob.java | 17 +- .../PrepareResultOrcidAssociationStep1.java | 4 +- .../PrepareResultOrcidAssociationStep2.java | 4 +- .../SparkOrcidToResultFromSemRelJob.java | 7 +- .../PrepareProjectResultsAssociation.java | 2 + 
.../PrepareResultCommunitySet.java | 4 +- ...kResultToCommunityFromOrganizationJob.java | 7 +- .../PrepareResultInstRepoAssociation.java | 43 +++-- ...arkResultToOrganizationFromIstRepoJob.java | 7 +- .../dhp/bulktag/oozie_app/workflow.xml | 27 ++- .../countrypropagation/oozie_app/workflow.xml | 20 ++- .../oozie_app/workflow.xml | 7 +- .../projecttoresult/oozie_app/workflow.xml | 29 ++-- .../oozie_app/workflow.xml | 29 ++-- .../oozie_app/workflow.xml | 52 +++--- .../raw/AbstractMdRecordToOafMapper.java | 164 +++++++++--------- 17 files changed, 213 insertions(+), 212 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java index 75d85e2ba..1c65e8ade 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.bulktag; +import static eu.dnetlib.dhp.PropagationConstant.removeOutputDir; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.util.Optional; @@ -84,6 +85,7 @@ public class SparkBulkTagJob { conf, isSparkSessionManaged, spark -> { + removeOutputDir(spark, outputPath); execBulkTag(spark, inputPath, outputPath, protoMappingParams, resultClazz, cc); }); } diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java index 9dc17701b..974b3a3b1 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java @@ -69,13 +69,16 @@ public class SparkCountryPropagationJob { 
runWithSparkSession( conf, isSparkSessionManaged, - spark -> execPropagation( - spark, - sourcePath, - preparedInfoPath, - outputPath, - resultClazz, - saveGraph)); + spark -> { + removeOutputDir(spark, outputPath); + execPropagation( + spark, + sourcePath, + preparedInfoPath, + outputPath, + resultClazz, + saveGraph); + }); } private static void execPropagation( diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java index 3e16b4b4b..400c8d8ef 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java @@ -74,9 +74,7 @@ public class PrepareResultOrcidAssociationStep1 { conf, isSparkSessionManaged, spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } + removeOutputDir(spark, outputPath); prepareInfo( spark, inputRelationPath, inputResultPath, outputResultPath, resultClazz, allowedsemrel); }); diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java index 65d8811bc..2cea32e58 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java @@ -50,9 +50,7 @@ public class PrepareResultOrcidAssociationStep2 { conf, isSparkSessionManaged, spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } + removeOutputDir(spark, outputPath); 
mergeInfo(spark, inputPath, outputPath); }); } diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java index ebb75a5a6..b34b29c48 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java @@ -70,11 +70,10 @@ public class SparkOrcidToResultFromSemRelJob { conf, isSparkSessionManaged, spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } - if (saveGraph) + removeOutputDir(spark, outputPath); + if (saveGraph) { execPropagation(spark, possibleUpdates, inputPath, outputPath, resultClazz); + } }); } diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java index 05dcdc692..c27da4258 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java @@ -60,6 +60,8 @@ public class PrepareProjectResultsAssociation { conf, isSparkSessionManaged, spark -> { + removeOutputDir(spark, potentialUpdatePath); + removeOutputDir(spark, alreadyLinkedPath); prepareResultProjProjectResults( spark, inputPath, diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java index e2d4d5687..90eb54e5f 100644 --- 
a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java @@ -55,9 +55,7 @@ public class PrepareResultCommunitySet { conf, isSparkSessionManaged, spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } + removeOutputDir(spark, outputPath); prepareInfo(spark, inputPath, outputPath, organizationMap); }); } diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java index 71275cc7f..66297e177 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java @@ -68,11 +68,10 @@ public class SparkResultToCommunityFromOrganizationJob { conf, isSparkSessionManaged, spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } - if (saveGraph) + removeOutputDir(spark, outputPath); + if (saveGraph) { execPropagation(spark, inputPath, outputPath, resultClazz, possibleupdatespath); + } }); } diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java index f8fe1668f..5f549be53 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java +++ 
b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java @@ -58,30 +58,15 @@ public class PrepareResultInstRepoAssociation { isSparkSessionManaged, spark -> { readNeededResources(spark, inputPath); + + removeOutputDir(spark, datasourceOrganizationPath); prepareDatasourceOrganization(spark, datasourceOrganizationPath); + + removeOutputDir(spark, alreadyLinkedPath); prepareAlreadyLinkedAssociation(spark, alreadyLinkedPath); }); } - private static void prepareAlreadyLinkedAssociation( - SparkSession spark, String alreadyLinkedPath) { - String query = "Select source resultId, collect_set(target) organizationSet " - + "from relation " - + "where datainfo.deletedbyinference = false " - + "and relClass = '" - + RELATION_RESULT_ORGANIZATION_REL_CLASS - + "' " - + "group by source"; - - spark - .sql(query) - .as(Encoders.bean(ResultOrganizationSet.class)) - // TODO retry to stick with datasets - .toJavaRDD() - .map(r -> OBJECT_MAPPER.writeValueAsString(r)) - .saveAsTextFile(alreadyLinkedPath, GzipCodec.class); - } - private static void readNeededResources(SparkSession spark, String inputPath) { Dataset datasource = readPath(spark, inputPath + "/datasource", Datasource.class); datasource.createOrReplaceTempView("datasource"); @@ -119,4 +104,24 @@ public class PrepareResultInstRepoAssociation { .option("compression", "gzip") .json(datasourceOrganizationPath); } + + private static void prepareAlreadyLinkedAssociation( + SparkSession spark, String alreadyLinkedPath) { + String query = "Select source resultId, collect_set(target) organizationSet " + + "from relation " + + "where datainfo.deletedbyinference = false " + + "and relClass = '" + + RELATION_RESULT_ORGANIZATION_REL_CLASS + + "' " + + "group by source"; + + spark + .sql(query) + .as(Encoders.bean(ResultOrganizationSet.class)) + // TODO retry to stick with datasets + .toJavaRDD() + .map(r -> OBJECT_MAPPER.writeValueAsString(r)) + 
.saveAsTextFile(alreadyLinkedPath, GzipCodec.class); + } + } diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java index 86634d43f..13577fa7c 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java @@ -83,10 +83,8 @@ public class SparkResultToOrganizationFromIstRepoJob { conf, isSparkSessionManaged, spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } - if (saveGraph) + removeOutputDir(spark, outputPath); + if (saveGraph) { execPropagation( spark, datasourceorganization, @@ -94,6 +92,7 @@ public class SparkResultToOrganizationFromIstRepoJob { inputPath, outputPath, resultClazz); + } }); } diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml index 754aba4f2..f019f8413 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml @@ -18,6 +18,17 @@ + + ${jobTracker} + ${nameNode} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + @@ -42,8 +53,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/relation ${nameNode}/${outputPath}/relation @@ -53,8 +62,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/organization ${nameNode}/${outputPath}/organization @@ -64,8 +71,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/project 
${nameNode}/${outputPath}/project @@ -75,8 +80,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/datasource ${nameNode}/${outputPath}/datasource @@ -95,8 +98,6 @@ - ${jobTracker} - ${nameNode} yarn-cluster cluster bulkTagging-publication @@ -124,8 +125,6 @@ - ${jobTracker} - ${nameNode} yarn-cluster cluster bulkTagging-dataset @@ -153,8 +152,6 @@ - ${jobTracker} - ${nameNode} yarn-cluster cluster bulkTagging-orp @@ -182,8 +179,6 @@ - ${jobTracker} - ${nameNode} yarn-cluster cluster bulkTagging-software diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml index fc877071d..85116e4cc 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml @@ -19,6 +19,17 @@ + + ${jobTracker} + ${nameNode} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + @@ -43,8 +54,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/relation ${nameNode}/${outputPath}/relation @@ -54,18 +63,15 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/organization ${nameNode}/${outputPath}/organization + - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/project ${nameNode}/${outputPath}/project @@ -75,8 +81,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/datasource ${nameNode}/${outputPath}/datasource diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/workflow.xml index e4429b710..5ddc5fedf 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/workflow.xml +++ 
b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/workflow.xml @@ -57,6 +57,7 @@ + ${jobTracker} @@ -81,7 +82,6 @@ - @@ -230,8 +230,8 @@ - + @@ -271,6 +271,7 @@ + yarn @@ -302,6 +303,7 @@ + yarn @@ -333,6 +335,7 @@ + yarn diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/workflow.xml index 24e1d3b7f..9e91c06fb 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/workflow.xml @@ -14,6 +14,17 @@ + + ${jobTracker} + ${nameNode} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + @@ -42,8 +53,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/relation ${nameNode}/${outputPath}/relation @@ -53,8 +62,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/publication ${nameNode}/${outputPath}/publication @@ -64,8 +71,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/dataset ${nameNode}/${outputPath}/dataset @@ -75,8 +80,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/otherresearchproduct ${nameNode}/${outputPath}/otherresearchproduct @@ -86,28 +89,24 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/software ${nameNode}/${outputPath}/software + - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/organization ${nameNode}/${outputPath}/organization + - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/project ${nameNode}/${outputPath}/project @@ -117,8 +116,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/datasource ${nameNode}/${outputPath}/datasource diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/workflow.xml 
b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/workflow.xml index d481cad05..6a329fdc4 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/workflow.xml @@ -14,6 +14,17 @@ + + ${jobTracker} + ${nameNode} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + @@ -38,8 +49,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/relation ${nameNode}/${outputPath}/relation @@ -49,8 +58,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/organization ${nameNode}/${outputPath}/organization @@ -60,8 +67,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/project ${nameNode}/${outputPath}/project @@ -71,8 +76,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/datasource ${nameNode}/${outputPath}/datasource @@ -101,8 +104,8 @@ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --sourcePath${sourcePath}/relation - --hive_metastore_uris${hive_metastore_uris} --outputPath${workingDir}/preparedInfo/resultCommunityList + --hive_metastore_uris${hive_metastore_uris} --organizationtoresultcommunitymap${organizationtoresultcommunitymap} @@ -136,9 +139,9 @@ --preparedInfoPath${workingDir}/preparedInfo/resultCommunityList --sourcePath${sourcePath}/publication + --outputPath${outputPath}/publication --hive_metastore_uris${hive_metastore_uris} --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication - --outputPath${outputPath}/publication --saveGraph${saveGraph} @@ -165,9 +168,9 @@ --preparedInfoPath${workingDir}/preparedInfo/resultCommunityList --sourcePath${sourcePath}/dataset + --outputPath${outputPath}/dataset --hive_metastore_uris${hive_metastore_uris} --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset - --outputPath${outputPath}/dataset --saveGraph${saveGraph} 
@@ -194,9 +197,9 @@ --preparedInfoPath${workingDir}/preparedInfo/resultCommunityList --sourcePath${sourcePath}/otherresearchproduct + --outputPath${outputPath}/otherresearchproduct --hive_metastore_uris${hive_metastore_uris} --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct - --outputPath${outputPath}/otherresearchproduct --saveGraph${saveGraph} @@ -223,9 +226,9 @@ --preparedInfoPath${workingDir}/preparedInfo/resultCommunityList --sourcePath${sourcePath}/software + --outputPath${outputPath}/software --hive_metastore_uris${hive_metastore_uris} --resultTableNameeu.dnetlib.dhp.schema.oaf.Software - --outputPath${outputPath}/software --saveGraph${saveGraph} diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml index a1b7f4ad7..e0563abae 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml @@ -10,6 +10,17 @@ + + ${jobTracker} + ${nameNode} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + @@ -38,8 +49,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/relation ${nameNode}/${outputPath}/relation @@ -49,8 +58,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/publication ${nameNode}/${outputPath}/publication @@ -60,8 +67,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/dataset ${nameNode}/${outputPath}/dataset @@ -71,8 +76,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/otherresearchproduct ${nameNode}/${outputPath}/otherresearchproduct @@ -82,8 +85,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/software ${nameNode}/${outputPath}/software @@ -93,8 +94,6 @@ - ${jobTracker} - 
${nameNode} ${nameNode}/${sourcePath}/organization ${nameNode}/${outputPath}/organization @@ -104,8 +103,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/project ${nameNode}/${outputPath}/project @@ -115,8 +112,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/datasource ${nameNode}/${outputPath}/datasource @@ -125,6 +120,7 @@ + yarn @@ -176,12 +172,12 @@ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --sourcePath${sourcePath}/publication - --hive_metastore_uris${hive_metastore_uris} - --saveGraph${saveGraph} - --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication --outputPath${outputPath}/relation --datasourceOrganizationPath${workingDir}/preparedInfo/datasourceOrganization --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + --hive_metastore_uris${hive_metastore_uris} + --saveGraph${saveGraph} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication @@ -206,12 +202,12 @@ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --sourcePath${sourcePath}/dataset - --hive_metastore_uris${hive_metastore_uris} - --saveGraph${saveGraph} - --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset --outputPath${outputPath}/relation --datasourceOrganizationPath${workingDir}/preparedInfo/datasourceOrganization --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + --hive_metastore_uris${hive_metastore_uris} + --saveGraph${saveGraph} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset @@ -236,12 +232,12 @@ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --sourcePath${sourcePath}/otherresearchproduct - --hive_metastore_uris${hive_metastore_uris} - --saveGraph${saveGraph} - --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct --outputPath${outputPath}/relation --datasourceOrganizationPath${workingDir}/preparedInfo/datasourceOrganization --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + --hive_metastore_uris${hive_metastore_uris} + --saveGraph${saveGraph} + 
--resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct @@ -266,12 +262,12 @@ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --sourcePath${sourcePath}/software - --hive_metastore_uris${hive_metastore_uris} - --saveGraph${saveGraph} - --resultTableNameeu.dnetlib.dhp.schema.oaf.Software --outputPath${outputPath}/relation --datasourceOrganizationPath${workingDir}/preparedInfo/datasourceOrganization --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + --hive_metastore_uris${hive_metastore_uris} + --saveGraph${saveGraph} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Software diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index be0b91022..b9c4e6c80 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -64,7 +64,7 @@ public abstract class AbstractMdRecordToOafMapper { } protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier( - "main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); + "main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); protected AbstractMdRecordToOafMapper(final Map code2name) { this.code2name = code2name; @@ -75,20 +75,20 @@ public abstract class AbstractMdRecordToOafMapper { DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); final Document doc = DocumentHelper - .parseText( - xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3)); + .parseText( + xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3)); final String type = doc.valueOf("//dr:CobjCategory/@type"); final KeyValue collectedFrom = getProvenanceDatasource( - doc, "//oaf:collectedFrom/@id", 
"//oaf:collectedFrom/@name"); + doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); if (collectedFrom == null) { return null; } final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) - ? collectedFrom - : getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name"); + ? collectedFrom + : getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name"); if (hostedBy == null) { return null; @@ -112,17 +112,17 @@ public abstract class AbstractMdRecordToOafMapper { } return keyValue( - createOpenaireId(10, dsId, true), - dsName); + createOpenaireId(10, dsId, true), + dsName); } protected List createOafs( - final Document doc, - final String type, - final KeyValue collectedFrom, - final KeyValue hostedBy, - final DataInfo info, - final long lastUpdateTimestamp) { + final Document doc, + final String type, + final KeyValue collectedFrom, + final KeyValue hostedBy, + final DataInfo info, + final long lastUpdateTimestamp) { final List oafs = new ArrayList<>(); @@ -179,10 +179,10 @@ public abstract class AbstractMdRecordToOafMapper { } private List addProjectRels( - final Document doc, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp) { + final Document doc, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp) { final List res = new ArrayList<>(); @@ -196,15 +196,15 @@ public abstract class AbstractMdRecordToOafMapper { final String projectId = createOpenaireId(40, originalId, true); res - .add( - getRelation( - docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, collectedFrom, info, - lastUpdateTimestamp)); + .add( + getRelation( + docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, collectedFrom, info, + lastUpdateTimestamp)); res - .add( - getRelation( - projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, collectedFrom, info, - lastUpdateTimestamp)); + .add( + getRelation( + projectId, docId, RESULT_PROJECT, OUTCOME, 
PRODUCES, collectedFrom, info, + lastUpdateTimestamp)); } } @@ -212,7 +212,7 @@ public abstract class AbstractMdRecordToOafMapper { } protected Relation getRelation(String source, String target, String relType, String subRelType, String relClass, - KeyValue collectedFrom, DataInfo info, long lastUpdateTimestamp) { + KeyValue collectedFrom, DataInfo info, long lastUpdateTimestamp) { final Relation rel = new Relation(); rel.setRelType(relType); rel.setSubRelType(subRelType); @@ -226,27 +226,27 @@ public abstract class AbstractMdRecordToOafMapper { } protected abstract List addOtherResultRels( - final Document doc, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp); + final Document doc, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp); private void populateResultFields( - final Result r, - final Document doc, - final KeyValue collectedFrom, - final KeyValue hostedBy, - final DataInfo info, - final long lastUpdateTimestamp) { + final Result r, + final Document doc, + final KeyValue collectedFrom, + final KeyValue hostedBy, + final DataInfo info, + final long lastUpdateTimestamp) { r.setDataInfo(info); r.setLastupdatetimestamp(lastUpdateTimestamp); r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false)); r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier"))); r.setCollectedfrom(Arrays.asList(collectedFrom)); r - .setPid( - prepareListStructProps( - doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); + .setPid( + prepareListStructProps( + doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); r.setDateofcollection(doc.valueOf("//dr:dateOfCollection")); r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation")); r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES @@ -289,7 +289,7 @@ public abstract class AbstractMdRecordToOafMapper { protected abstract Qualifier 
prepareResourceType(Document doc, DataInfo info); protected abstract List prepareInstances( - Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby); + Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby); protected abstract List> prepareSources(Document doc, DataInfo info); @@ -314,13 +314,13 @@ public abstract class AbstractMdRecordToOafMapper { protected abstract List prepareAuthors(Document doc, DataInfo info); protected abstract List> prepareOtherResearchProductTools( - Document doc, DataInfo info); + Document doc, DataInfo info); protected abstract List> prepareOtherResearchProductContactGroups( - Document doc, DataInfo info); + Document doc, DataInfo info); protected abstract List> prepareOtherResearchProductContactPersons( - Document doc, DataInfo info); + Document doc, DataInfo info); protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info); @@ -329,7 +329,7 @@ public abstract class AbstractMdRecordToOafMapper { protected abstract List prepareSoftwareLicenses(Document doc, DataInfo info); protected abstract List> prepareSoftwareDocumentationUrls( - Document doc, DataInfo info); + Document doc, DataInfo info); protected abstract List prepareDatasetGeoLocations(Document doc, DataInfo info); @@ -359,37 +359,37 @@ public abstract class AbstractMdRecordToOafMapper { final String edition = n.valueOf("@edition"); if (StringUtils.isNotBlank(name)) { return journal( - name, - issnPrinted, - issnOnline, - issnLinking, - ep, - iss, - sp, - vol, - edition, - null, - null, - info); + name, + issnPrinted, + issnOnline, + issnLinking, + ep, + iss, + sp, + vol, + edition, + null, + null, + info); } } return null; } protected Qualifier prepareQualifier( - final Node node, final String xpath, final String schemeId, final String schemeName) { + final Node node, final String xpath, final String schemeId, final String schemeName) { final String classId = node.valueOf(xpath); final String className = 
code2name.get(classId); return qualifier(classId, className, schemeId, schemeName); } protected List prepareListStructProps( - final Node node, - final String xpath, - final String xpathClassId, - final String schemeId, - final String schemeName, - final DataInfo info) { + final Node node, + final String xpath, + final String xpathClassId, + final String schemeId, + final String schemeName, + final DataInfo info) { final List res = new ArrayList<>(); for (final Object o : node.selectNodes(xpath)) { final Node n = (Node) o; @@ -401,7 +401,7 @@ public abstract class AbstractMdRecordToOafMapper { } protected List prepareListStructProps( - final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) { + final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) { final List res = new ArrayList<>(); for (final Object o : node.selectNodes(xpath)) { final Node n = (Node) o; @@ -411,19 +411,19 @@ public abstract class AbstractMdRecordToOafMapper { } protected List prepareListStructProps( - final Node node, final String xpath, final DataInfo info) { + final Node node, final String xpath, final DataInfo info) { final List res = new ArrayList<>(); for (final Object o : node.selectNodes(xpath)) { final Node n = (Node) o; res - .add( - structuredProperty( - n.getText(), - n.valueOf("@classid"), - n.valueOf("@classname"), - n.valueOf("@schemeid"), - n.valueOf("@schemename"), - info)); + .add( + structuredProperty( + n.getText(), + n.valueOf("@classid"), + n.valueOf("@classname"), + n.valueOf("@schemeid"), + n.valueOf("@schemename"), + info)); } return res; } @@ -450,7 +450,7 @@ public abstract class AbstractMdRecordToOafMapper { if (n == null) { return dataInfo( - false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9"); + false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9"); } final String paClassId = n.valueOf("./oaf:provenanceaction/@classid"); @@ -464,12 +464,12 @@ public abstract class 
AbstractMdRecordToOafMapper { final String trust = n.valueOf("./oaf:trust"); return dataInfo( - deletedbyinference, - inferenceprovenance, - inferred, - false, - qualifier(paClassId, paClassName, paSchemeId, paSchemeName), - trust); + deletedbyinference, + inferenceprovenance, + inferred, + false, + qualifier(paClassId, paClassName, paSchemeId, paSchemeName), + trust); } protected Field prepareField(final Node node, final String xpath, final DataInfo info) { @@ -477,7 +477,7 @@ public abstract class AbstractMdRecordToOafMapper { } protected List> prepareListFields( - final Node node, final String xpath, final DataInfo info) { + final Node node, final String xpath, final DataInfo info) { return listFields(info, prepareListString(node, xpath)); } From 8828458acfd38e47b03c1a88335b4b5bc69c9eab Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 14 May 2020 10:34:12 +0200 Subject: [PATCH 04/23] minor changes --- .../PrepareResultOrcidAssociationStep1.java | 33 ++++++++++--------- .../SparkOrcidToResultFromSemRelJob.java | 22 +++++++------ 2 files changed, 29 insertions(+), 26 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java index 3e16b4b4b..7cd057cf3 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java @@ -97,22 +97,23 @@ public class PrepareResultOrcidAssociationStep1 { Dataset result = readPath(spark, inputResultPath, resultClazz); result.createOrReplaceTempView("result"); - String query = " select target resultId, author authorList" - + " from (select id, collect_set(named_struct('name', name, 'surname', surname, 'fullname', fullname, 
'orcid', orcid)) author " - + " from ( " - + " select id, MyT.fullname, MyT.name, MyT.surname, MyP.value orcid " - + " from result " - + " lateral view explode (author) a as MyT " - + " lateral view explode (MyT.pid) p as MyP " - + " where MyP.qualifier.classid = 'ORCID') tmp " - + " group by id) r_t " - + " join (" - + " select source, target " - + " from relation " - + " where datainfo.deletedbyinference = false " - + getConstraintList(" relclass = '", allowedsemrel) - + ") rel_rel " - + " on source = id"; + String query = + "SELECT target resultId, author authorList" + + " FROM (SELECT id, collect_set(named_struct('name', name, 'surname', surname, 'fullname', fullname, 'orcid', orcid)) author " + + " FROM ( " + + " SELECT DISTINCT id, MyT.fullname, MyT.name, MyT.surname, MyP.value orcid " + + " FROM result " + + " LATERAL VIEW EXPLODE (author) a AS MyT " + + " LATERAL VIEW EXPLODE (MyT.pid) p AS MyP " + + " WHERE MyP.qualifier.classid = 'ORCID') tmp " + + " GROUP BY id) r_t " + + " JOIN (" + + " SELECT source, target " + + " FROM relation " + + " WHERE datainfo.deletedbyinference = false " + + getConstraintList(" relclass = '", allowedsemrel) + + " ) rel_rel " + + " ON source = id"; spark .sql(query) .as(Encoders.bean(ResultOrcidList.class)) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java index ebb75a5a6..b93b66d9f 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java @@ -132,16 +132,16 @@ public class SparkOrcidToResultFromSemRelJob { private static boolean enrichAuthor(AutoritativeAuthor autoritative_author, Author author) { boolean toaddpid = false; - if 
(StringUtils.isNoneEmpty(autoritative_author.getSurname())) { - if (StringUtils.isNoneEmpty(author.getSurname())) { + if (StringUtils.isNotEmpty(autoritative_author.getSurname())) { + if (StringUtils.isNotEmpty(author.getSurname())) { if (autoritative_author .getSurname() .trim() .equalsIgnoreCase(author.getSurname().trim())) { // have the same surname. Check the name - if (StringUtils.isNoneEmpty(autoritative_author.getName())) { - if (StringUtils.isNoneEmpty(author.getName())) { + if (StringUtils.isNotEmpty(autoritative_author.getName())) { + if (StringUtils.isNotEmpty(author.getName())) { if (autoritative_author .getName() .trim() @@ -150,12 +150,14 @@ public class SparkOrcidToResultFromSemRelJob { } // they could be differently written (i.e. only the initials of the name // in one of the two - if (autoritative_author - .getName() - .trim() - .substring(0, 0) - .equalsIgnoreCase(author.getName().trim().substring(0, 0))) { - toaddpid = true; + else { + if (autoritative_author + .getName() + .trim() + .substring(0, 0) + .equalsIgnoreCase(author.getName().trim().substring(0, 0))) { + toaddpid = true; + } } } } From f044d093156c3c29cf00a4a9b498459885ebcdd0 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 14 May 2020 15:07:24 +0200 Subject: [PATCH 05/23] revised mapping: more accurate mapping for name/surname from datacite format; improved mapping of null values --- .../migration/ProtoConverter.java | 27 ++++++++++++----- .../raw/MigrateDbEntitiesApplication.java | 20 +++++-------- .../dhp/oa/graph/raw/OdfToOafMapper.java | 30 +++++++++++++++---- .../dhp/oa/graph/raw/common/PacePerson.java | 1 - 4 files changed, 51 insertions(+), 27 deletions(-) diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/ProtoConverter.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/ProtoConverter.java index 90d573ac0..e55c0eb7b 100644 --- 
a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/ProtoConverter.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/ProtoConverter.java @@ -523,7 +523,9 @@ public class ProtoConverter implements Serializable { } private static Context mapContext(ResultProtos.Result.Context context) { - + if (context == null || StringUtils.isBlank(context.getId())) { + return null; + } final Context entity = new Context(); entity.setId(context.getId()); entity @@ -537,6 +539,10 @@ public class ProtoConverter implements Serializable { } public static KeyValue mapKV(FieldTypeProtos.KeyValue kv) { + if (kv == null || StringUtils.isBlank(kv.getKey()) & StringUtils.isBlank(kv.getValue())) { + return null; + } + final KeyValue keyValue = new KeyValue(); keyValue.setKey(kv.getKey()); keyValue.setValue(kv.getValue()); @@ -575,6 +581,10 @@ public class ProtoConverter implements Serializable { } public static StructuredProperty mapStructuredProperty(FieldTypeProtos.StructuredProperty sp) { + if (sp == null | StringUtils.isBlank(sp.getValue())) { + return null; + } + final StructuredProperty structuredProperty = new StructuredProperty(); structuredProperty.setValue(sp.getValue()); structuredProperty.setQualifier(mapQualifier(sp.getQualifier())); @@ -611,6 +621,10 @@ public class ProtoConverter implements Serializable { } public static Field mapStringField(FieldTypeProtos.StringField s) { + if (s == null || StringUtils.isBlank(s.getValue())) { + return null; + } + final Field stringField = new Field<>(); stringField.setValue(s.getValue()); stringField.setDataInfo(mapDataInfo(s.getDataInfo())); @@ -618,19 +632,16 @@ public class ProtoConverter implements Serializable { } public static Field mapBoolField(FieldTypeProtos.BoolField b) { + if (b == null) { + return null; + } + final Field booleanField = new Field<>(); booleanField.setValue(b.getValue()); booleanField.setDataInfo(mapDataInfo(b.getDataInfo())); return 
booleanField; } - public static Field mapIntField(FieldTypeProtos.IntField b) { - final Field entity = new Field<>(); - entity.setValue(b.getValue()); - entity.setDataInfo(mapDataInfo(b.getDataInfo())); - return entity; - } - public static Journal mapJournal(FieldTypeProtos.Journal j) { final Journal journal = new Journal(); journal.setConferencedate(j.getConferencedate()); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index e5e348642..ebe2b703b 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -50,8 +50,7 @@ import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -public class MigrateDbEntitiesApplication extends AbstractMigrationApplication - implements Closeable { +public class MigrateDbEntitiesApplication extends AbstractMigrationApplication implements Closeable { private static final Log log = LogFactory.getLog(MigrateDbEntitiesApplication.class); @@ -128,9 +127,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication } public List processDatasource(final ResultSet rs) { - try { - final DataInfo info = prepareDataInfo(rs); final Datasource ds = new Datasource(); @@ -194,7 +191,6 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication public List processProject(final ResultSet rs) { try { - final DataInfo info = prepareDataInfo(rs); final Project p = new Project(); @@ -249,9 +245,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication } public List processOrganization(final ResultSet rs) { - try { - final DataInfo info = 
prepareDataInfo(rs); final Organization o = new Organization(); @@ -370,14 +364,12 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication final DataInfo info = dataInfo( false, null, false, false, - qualifier(USER_CLAIM, USER_CLAIM, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), "0.9"); final List collectedFrom = listKeyValues( createOpenaireId(10, "infrastruct_::openaire", true), "OpenAIRE"); try { - if (rs.getString(SOURCE_TYPE).equals("context")) { final Result r; @@ -461,9 +453,12 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication final Boolean inferred = rs.getBoolean("inferred"); final String trust = rs.getString("trust"); return dataInfo( - - deletedbyinference, inferenceprovenance, inferred, false, ENTITYREGISTRY_PROVENANCE_ACTION, trust); - + deletedbyinference, + inferenceprovenance, + inferred, + false, + ENTITYREGISTRY_PROVENANCE_ACTION, + trust); } private Qualifier prepareQualifierSplitting(final String s) { @@ -535,4 +530,5 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication super.close(); dbClient.close(); } + } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 04984d008..5baac12fd 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -12,6 +12,7 @@ import org.apache.commons.lang3.StringUtils; import org.dom4j.Document; import org.dom4j.Node; +import eu.dnetlib.dhp.oa.graph.raw.common.PacePerson; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.DataInfo; @@ -44,9 +45,24 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { for (final Object o : 
doc.selectNodes("//datacite:creator")) { final Node n = (Node) o; final Author author = new Author(); - author.setFullname(n.valueOf("./datacite:creatorName")); - author.setName(n.valueOf("./datacite:givenName")); - author.setSurname(n.valueOf("./datacite:familyName")); + final String fullname = n.valueOf("./datacite:creatorName"); + author.setFullname(fullname); + + PacePerson pp = new PacePerson(fullname, false); + final String name = n.valueOf("./datacite:givenName"); + if (StringUtils.isBlank(name) & pp.isAccurate()) { + author.setName(pp.getNormalisedFirstName()); + } else { + author.setName(name); + } + + final String surname = n.valueOf("./datacite:familyName"); + if (StringUtils.isBlank(surname) & pp.isAccurate()) { + author.setSurname(pp.getNormalisedSurname()); + } else { + author.setSurname(surname); + } + author.setAffiliation(prepareListFields(doc, "./datacite:affiliation", info)); author.setPid(preparePids(doc, info)); author.setRank(pos++); @@ -77,8 +93,6 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { final KeyValue hostedby) { final Instance instance = new Instance(); - final Set url = new HashSet<>(); - instance.setUrl(new ArrayList<>()); instance .setInstancetype( prepareQualifier( @@ -97,6 +111,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { .setProcessingchargecurrency( field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); + final Set url = new HashSet<>(); for (final Object o : doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='URL']")) { url.add(((Node) o).getText().trim()); } @@ -109,7 +124,10 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { for (final Object o : doc.selectNodes("//datacite:identifier[@identifierType='DOI']")) { url.add(HTTP_DX_DOI_PREIFX + ((Node) o).getText().trim()); } - instance.getUrl().addAll(url); + if (!url.isEmpty()) { + instance.setUrl(new ArrayList<>()); + instance.getUrl().addAll(url); + } return 
Arrays.asList(instance); } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/PacePerson.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/PacePerson.java index d1c615dcd..6e474f2f3 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/PacePerson.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/PacePerson.java @@ -1,7 +1,6 @@ package eu.dnetlib.dhp.oa.graph.raw.common; -import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.text.Normalizer; import java.util.HashSet; From 42085e8d99220ccdf1f4c9cc38d26db24b9544a2 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 14 May 2020 18:22:28 +0200 Subject: [PATCH 06/23] added some constants --- .../java/eu/dnetlib/dhp/schema/common/ModelConstants.java | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java index accc06d12..e32dd10fa 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java @@ -13,6 +13,7 @@ public class ModelConstants { public static final String DNET_DATA_CITE_DATE = "dnet:dataCite_date"; public static final String DNET_DATA_CITE_RESOURCE = "dnet:dataCite_resource"; public static final String DNET_PROVENANCE_ACTIONS = "dnet:provenanceActions"; + public static final String DNET_COUNTRY_TYPE = "dnet:countries"; public static final String SYSIMPORT_CROSSWALK_REPOSITORY = "sysimport:crosswalk:repository"; public static final String SYSIMPORT_CROSSWALK_ENTITYREGISTRY = "sysimport:crosswalk:entityregistry"; @@ -49,6 +50,13 @@ public class ModelConstants { public static final String HAS_PARTICIPANT = "hasParticipant"; public static final String 
IS_PARTICIPANT = "isParticipant"; + public static final String RESULT_ORGANIZATION = "resultOrganization"; + public static final String AFFILIATION = "affiliation"; + public static final String IS_AUTHOR_INSTITUTION_OF = "isAuthorInstitutionOf"; + public static final String HAS_AUTHOR_INSTITUTION = "hasAuthorInstitution"; + + public static final String MERGES = "merges"; + public static final String UNKNOWN = "UNKNOWN"; public static final String NOT_AVAILABLE = "not available"; From d05630d9795f1a26a9ea9ce33d2337c9156217ff Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 14 May 2020 18:22:50 +0200 Subject: [PATCH 07/23] removed the constants added in ModelConstants --- .../eu/dnetlib/dhp/PropagationConstant.java | 30 ++++--------------- 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java index 8d2fede82..13ed46508 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java @@ -4,6 +4,8 @@ package eu.dnetlib.dhp; import java.util.List; import java.util.Optional; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.common.ModelSupport; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -24,10 +26,6 @@ public class PropagationConstant { public static final String TRUE = "true"; - public static final String DNET_COUNTRY_SCHEMA = "dnet:countries"; - public static final String DNET_SCHEMA_NAME = "dnet:provenanceActions"; - public static final String DNET_SCHEMA_ID = "dnet:provenanceActions"; - public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_ID = "country:instrepos"; public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_NAME = 
"Propagation of country to result collected from datasources of type institutional repositories"; @@ -46,22 +44,6 @@ public class PropagationConstant { public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_ID = "authorpid:result"; public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_NAME = "Propagation of authors pid to result through semantic relations"; - public static final String RELATION_DATASOURCE_ORGANIZATION_REL_CLASS = "isProvidedBy"; - - public static final String RELATION_RESULTORGANIZATION_REL_TYPE = "resultOrganization"; - public static final String RELATION_RESULTORGANIZATION_SUBREL_TYPE = "affiliation"; - public static final String RELATION_ORGANIZATION_RESULT_REL_CLASS = "isAuthorInstitutionOf"; - public static final String RELATION_RESULT_ORGANIZATION_REL_CLASS = "hasAuthorInstitution"; - - public static final String RELATION_RESULTRESULT_REL_TYPE = "resultResult"; - - public static final String RELATION_RESULTPROJECT_REL_TYPE = "resultProject"; - public static final String RELATION_RESULTPROJECT_SUBREL_TYPE = "outcome"; - public static final String RELATION_RESULT_PROJECT_REL_CLASS = "isProducedBy"; - public static final String RELATION_PROJECT_RESULT_REL_CLASS = "produces"; - - public static final String RELATION_REPRESENTATIVERESULT_RESULT_CLASS = "merges"; - public static final String PROPAGATION_AUTHOR_PID = "ORCID"; public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -76,8 +58,8 @@ public class PropagationConstant { Country nc = new Country(); nc.setClassid(classid); nc.setClassname(classname); - nc.setSchemename(DNET_COUNTRY_SCHEMA); - nc.setSchemeid(DNET_COUNTRY_SCHEMA); + nc.setSchemename(ModelConstants.DNET_COUNTRY_TYPE); + nc.setSchemeid(ModelConstants.DNET_COUNTRY_TYPE); nc .setDataInfo( getDataInfo( @@ -102,8 +84,8 @@ public class PropagationConstant { Qualifier pa = new Qualifier(); pa.setClassid(inference_class_id); pa.setClassname(inference_class_name); - 
pa.setSchemeid(DNET_SCHEMA_ID); - pa.setSchemename(DNET_SCHEMA_NAME); + pa.setSchemeid(ModelConstants.DNET_PID_TYPES); + pa.setSchemename(ModelConstants.DNET_PID_TYPES); return pa; } From f25db01664eb56d2250d00e95822b4aaacaf52bf Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 14 May 2020 18:29:24 +0200 Subject: [PATCH 08/23] changed in the constant from propagationconstants to modelconstants --- .../PrepareDatasourceCountryAssociation.java | 3 ++- .../PrepareProjectResultsAssociation.java | 3 ++- .../SparkResultToProjectThroughSemRelJob.java | 13 +++++++------ .../PrepareResultCommunitySet.java | 5 +++-- .../PrepareResultInstRepoAssociation.java | 5 +++-- .../SparkResultToOrganizationFromIstRepoJob.java | 13 +++++++------ 6 files changed, 24 insertions(+), 18 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java index e91a1e48a..1d01d35e5 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java @@ -7,6 +7,7 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import java.util.Arrays; import java.util.List; +import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; @@ -100,7 +101,7 @@ public class PrepareDatasourceCountryAssociation { + "JOIN ( SELECT source, target " + " FROM relation " + " WHERE relclass = '" - + RELATION_DATASOURCE_ORGANIZATION_REL_CLASS + + ModelConstants.IS_PROVIDED_BY + "' " + " AND datainfo.deletedbyinference = false ) rel " + "ON d.id = rel.source " diff --git 
a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java index c27da4258..920ef1f06 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java @@ -8,6 +8,7 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import java.util.Arrays; import java.util.List; +import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; @@ -85,7 +86,7 @@ public class PrepareProjectResultsAssociation { + " FROM relation " + " WHERE datainfo.deletedbyinference = false " + " AND relClass = '" - + RELATION_RESULT_PROJECT_REL_CLASS + + ModelConstants.IS_PRODUCED_BY + "'"; Dataset resproj_relation = spark.sql(resproj_relation_query); diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java index 36694b3dd..44a439ab7 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java @@ -9,6 +9,7 @@ import java.util.Iterator; import java.util.List; import java.util.Optional; +import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.FlatMapFunction; @@ -122,9 +123,9 @@ public class SparkResultToProjectThroughSemRelJob { 
getRelation( resId, projectId, - RELATION_RESULT_PROJECT_REL_CLASS, - RELATION_RESULTPROJECT_REL_TYPE, - RELATION_RESULTPROJECT_SUBREL_TYPE, + ModelConstants.IS_PRODUCED_BY, + ModelConstants.RESULT_PROJECT , + ModelConstants.OUTCOME, PROPAGATION_DATA_INFO_TYPE, PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID, PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME)); @@ -133,9 +134,9 @@ public class SparkResultToProjectThroughSemRelJob { getRelation( projectId, resId, - RELATION_PROJECT_RESULT_REL_CLASS, - RELATION_RESULTPROJECT_REL_TYPE, - RELATION_RESULTPROJECT_SUBREL_TYPE, + ModelConstants.PRODUCES, + ModelConstants.RESULT_PROJECT, + ModelConstants.OUTCOME, PROPAGATION_DATA_INFO_TYPE, PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID, PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME)); diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java index 90eb54e5f..fc9f395d1 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java @@ -6,6 +6,7 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import java.util.*; +import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -74,13 +75,13 @@ public class PrepareResultCommunitySet { + " FROM relation " + " WHERE datainfo.deletedbyinference = false " + " AND relClass = '" - + RELATION_RESULT_ORGANIZATION_REL_CLASS + + ModelConstants.HAS_AUTHOR_INSTITUTION + "') result_organization " + "LEFT JOIN (SELECT source, collect_set(target) org_set " + " FROM 
relation " + " WHERE datainfo.deletedbyinference = false " + " AND relClass = '" - + RELATION_REPRESENTATIVERESULT_RESULT_CLASS + + ModelConstants.MERGES + "' " + " GROUP BY source) organization_organization " + "ON result_organization.target = organization_organization.source "; diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java index 5f549be53..0a83e4195 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java @@ -4,6 +4,7 @@ package eu.dnetlib.dhp.resulttoorganizationfrominstrepo; import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; +import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; @@ -91,7 +92,7 @@ public class PrepareResultInstRepoAssociation { + "JOIN ( SELECT source, target " + "FROM relation " + "WHERE relclass = '" - + RELATION_DATASOURCE_ORGANIZATION_REL_CLASS + + ModelConstants.IS_PROVIDED_BY + "' " + "AND datainfo.deletedbyinference = false ) rel " + "ON d.id = rel.source "; @@ -111,7 +112,7 @@ public class PrepareResultInstRepoAssociation { + "from relation " + "where datainfo.deletedbyinference = false " + "and relClass = '" - + RELATION_RESULT_ORGANIZATION_REL_CLASS + + ModelConstants.HAS_AUTHOR_INSTITUTION + "' " + "group by source"; diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java 
b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java index 13577fa7c..fc9a46d72 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java @@ -6,6 +6,7 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import java.util.*; +import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; @@ -150,9 +151,9 @@ public class SparkResultToOrganizationFromIstRepoJob { getRelation( orgId, resultId, - RELATION_ORGANIZATION_RESULT_REL_CLASS, - RELATION_RESULTORGANIZATION_REL_TYPE, - RELATION_RESULTORGANIZATION_SUBREL_TYPE, + ModelConstants.IS_AUTHOR_INSTITUTION_OF, + ModelConstants.RESULT_ORGANIZATION, + ModelConstants.AFFILIATION, PROPAGATION_DATA_INFO_TYPE, PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID, PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME)); @@ -161,9 +162,9 @@ public class SparkResultToOrganizationFromIstRepoJob { getRelation( resultId, orgId, - RELATION_RESULT_ORGANIZATION_REL_CLASS, - RELATION_RESULTORGANIZATION_REL_TYPE, - RELATION_RESULTORGANIZATION_SUBREL_TYPE, + ModelConstants.HAS_AUTHOR_INSTITUTION, + ModelConstants.RESULT_ORGANIZATION, + ModelConstants.AFFILIATION, PROPAGATION_DATA_INFO_TYPE, PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID, PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME)); From eb64335a547c1a8b9e708f50b6f33b362fa1e54e Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 15 May 2020 09:05:26 +0200 Subject: [PATCH 09/23] parallel implementation for graph Hive importer --- .../graph/hive/GraphHiveTableImporterJob.java | 79 
+++++++ .../hive/oozie_app/lib/scripts/reset_db.sql | 2 + .../dhp/oa/graph/hive/oozie_app/workflow.xml | 200 +++++++++++++++++- .../oa/graph/hive_db_importer_parameters.json | 26 +++ .../graph/hive_table_importer_parameters.json | 32 +++ 5 files changed, 331 insertions(+), 8 deletions(-) create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/reset_db.sql create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive_db_importer_parameters.json create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive_table_importer_parameters.json diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java new file mode 100644 index 000000000..f88f7457f --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java @@ -0,0 +1,79 @@ + +package eu.dnetlib.dhp.oa.graph.hive; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; +import static eu.dnetlib.dhp.schema.common.ModelSupport.tableIdentifier; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Oaf; + +public class GraphHiveTableImporterJob { + + private static final Logger log = 
LoggerFactory.getLogger(GraphHiveTableImporterJob.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(String[] args) throws Exception { + + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + GraphHiveTableImporterJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/hive_table_importer_parameters.json"))); + parser.parseArgument(args); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String inputPath = parser.get("inputPath"); + log.info("inputPath: {}", inputPath); + + String hiveDbName = parser.get("hiveDbName"); + log.info("hiveDbName: {}", hiveDbName); + + final String className = parser.get("className"); + log.info("className: {}", className); + + Class clazz = (Class) Class.forName(className); + + String hiveMetastoreUris = parser.get("hiveMetastoreUris"); + log.info("hiveMetastoreUris: {}", hiveMetastoreUris); + + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", hiveMetastoreUris); + + runWithSparkHiveSession( + conf, isSparkSessionManaged, spark -> loadGraphTable(spark, inputPath, hiveDbName, clazz)); + } + + // protected for testing + private static void loadGraphTable(SparkSession spark, String inputPath, String hiveDbName, + Class clazz) { + + spark + .read() + .textFile(inputPath) + .map((MapFunction) s -> OBJECT_MAPPER.readValue(s, clazz), Encoders.bean(clazz)) + .write() + .mode(SaveMode.Overwrite) + .saveAsTable(tableIdentifier(hiveDbName, clazz)); + } + +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/reset_db.sql b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/reset_db.sql new file mode 100644 index 000000000..484afde80 --- /dev/null +++ 
b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/reset_db.sql @@ -0,0 +1,2 @@ +DROP DATABASE IF EXISTS ${hiveDbName} CASCADE; +CREATE DATABASE ${hiveDbName}; \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml index e837ac6b3..2bcbbba5e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml @@ -72,18 +72,44 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + + + + + hive.metastore.uris + ${hiveMetastoreUris} + + + ${hiveJdbcUrl}/${hiveDbName} + + hiveDbName=${hiveDbName} + + + + + + + + + + + + + + + + yarn cluster - MapGraphAsHiveDB - eu.dnetlib.dhp.oa.graph.hive.GraphHiveImporterJob + Import table publication + eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob dhp-graph-mapper-${projectVersion}.jar --executor-memory=${sparkExecutorMemory} @@ -95,18 +121,175 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - --inputPath${inputPath} + --inputPath${inputPath}/publication --hiveDbName${hiveDbName} + --classNameeu.dnetlib.dhp.schema.oaf.Publication --hiveMetastoreUris${hiveMetastoreUris} - + + + + yarn + cluster + Import table dataset + eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + 
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${inputPath}/dataset + --hiveDbName${hiveDbName} + --classNameeu.dnetlib.dhp.schema.oaf.Dataset + --hiveMetastoreUris${hiveMetastoreUris} + + + + + + + + yarn + cluster + Import table otherresearchproduct + eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${inputPath}/otherresearchproduct + --hiveDbName${hiveDbName} + --classNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --hiveMetastoreUris${hiveMetastoreUris} + + + + + + + + yarn + cluster + Import table software + eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${inputPath}/software + --hiveDbName${hiveDbName} + --classNameeu.dnetlib.dhp.schema.oaf.Software + --hiveMetastoreUris${hiveMetastoreUris} + + + + + + + + yarn + cluster + Import table datasource + eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob + dhp-graph-mapper-${projectVersion}.jar + + 
--executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${inputPath}/datasource + --hiveDbName${hiveDbName} + --classNameeu.dnetlib.dhp.schema.oaf.Datasource + --hiveMetastoreUris${hiveMetastoreUris} + + + + + + + + yarn + cluster + Import table organization + eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${inputPath}/organization + --hiveDbName${hiveDbName} + --classNameeu.dnetlib.dhp.schema.oaf.Organization + --hiveMetastoreUris${hiveMetastoreUris} + + + + + + + + yarn + cluster + Import table project + eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + 
--inputPath${inputPath}/project + --hiveDbName${hiveDbName} + --classNameeu.dnetlib.dhp.schema.oaf.Project + --hiveMetastoreUris${hiveMetastoreUris} + + + + + + + - ${jobTracker} - ${nameNode} hive.metastore.uris @@ -122,4 +305,5 @@ + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive_db_importer_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive_db_importer_parameters.json new file mode 100644 index 000000000..d6c13773a --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive_db_importer_parameters.json @@ -0,0 +1,26 @@ +[ + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "in", + "paramLongName": "inputPath", + "paramDescription": "the path to the graph data dump to read", + "paramRequired": true + }, + { + "paramName": "hmu", + "paramLongName": "hiveMetastoreUris", + "paramDescription": "the hive metastore uris", + "paramRequired": true + }, + { + "paramName": "db", + "paramLongName": "hiveDbName", + "paramDescription": "the target hive database name", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive_table_importer_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive_table_importer_parameters.json new file mode 100644 index 000000000..5b5b0743c --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive_table_importer_parameters.json @@ -0,0 +1,32 @@ +[ + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "in", + "paramLongName": 
"inputPath", + "paramDescription": "the path to the graph data dump to read", + "paramRequired": true + }, + { + "paramName": "hmu", + "paramLongName": "hiveMetastoreUris", + "paramDescription": "the hive metastore uris", + "paramRequired": true + }, + { + "paramName": "db", + "paramLongName": "hiveDbName", + "paramDescription": "the target hive database name", + "paramRequired": true + }, + { + "paramName": "tn", + "paramLongName": "className", + "paramDescription": "the class modelling the target table", + "paramRequired": true + } +] \ No newline at end of file From fd62359538d397cfd9e73a3602e38638a5a0ff1d Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 15 May 2020 09:28:15 +0200 Subject: [PATCH 10/23] cleanup --- .../graph/migrate_actionsets_parameters.json | 10 ---------- .../eu/dnetlib/dhp/oa/graph/scriptFile.scala | 0 .../transform_actionsets_parameters.json | 20 ------------------- 3 files changed, 30 deletions(-) delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/migrate_actionsets_parameters.json create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/scriptFile.scala delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/transform_actionsets_parameters.json diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/migrate_actionsets_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/migrate_actionsets_parameters.json deleted file mode 100644 index c4910ec61..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/migrate_actionsets_parameters.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - {"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true}, - {"paramName":"sn", "paramLongName":"sourceNameNode", "paramDescription": "nameNode of the source cluster", "paramRequired": 
true}, - {"paramName":"tn", "paramLongName":"targetNameNode", "paramDescription": "namoNode of the target cluster", "paramRequired": true}, - {"paramName":"w", "paramLongName":"workingDirectory", "paramDescription": "working directory", "paramRequired": true}, - {"paramName":"nm", "paramLongName":"distcp_num_maps", "paramDescription": "maximum number of map tasks used in the distcp process", "paramRequired": true}, - {"paramName":"mm", "paramLongName":"distcp_memory_mb", "paramDescription": "memory for distcp action copying actionsets from remote cluster", "paramRequired": true}, - {"paramName":"tt", "paramLongName":"distcp_task_timeout", "paramDescription": "timeout for distcp copying actions from remote cluster", "paramRequired": true}, - {"paramName":"tr", "paramLongName":"transform_only", "paramDescription": "activate tranform-only mode. Only apply transformation step", "paramRequired": true} -] diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/scriptFile.scala b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/scriptFile.scala new file mode 100644 index 000000000..e69de29bb diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/transform_actionsets_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/transform_actionsets_parameters.json deleted file mode 100644 index 6fa10f739..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/transform_actionsets_parameters.json +++ /dev/null @@ -1,20 +0,0 @@ -[ - { - "paramName": "mt", - "paramLongName": "master", - "paramDescription": "should be local or yarn", - "paramRequired": true - }, - { - "paramName": "is", - "paramLongName": "isLookupUrl", - "paramDescription": "URL of the isLookUp Service", - "paramRequired": true - }, - { - "paramName": "i", - "paramLongName": "inputPaths", - "paramDescription": "URL of the isLookUp Service", - "paramRequired": 
true - } -] From 9d028ffe1c91c270482c75d5860e9f0222c01341 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 15 May 2020 09:28:55 +0200 Subject: [PATCH 11/23] cleanup --- .../src/main/resources/eu/dnetlib/dhp/oa/graph/scriptFile.scala | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/scriptFile.scala diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/scriptFile.scala b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/scriptFile.scala deleted file mode 100644 index e69de29bb..000000000 From 18f46e47b915a3770e584765bce4e4d281f3668f Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 15 May 2020 09:34:48 +0200 Subject: [PATCH 12/23] added relations to the graph2hive import workflow --- .../dhp/oa/graph/hive/oozie_app/workflow.xml | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml index 2bcbbba5e..8566d7667 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml @@ -102,6 +102,7 @@ + @@ -286,6 +287,32 @@ + + + yarn + cluster + Import table project + eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf 
spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${inputPath}/relation + --hiveDbName${hiveDbName} + --classNameeu.dnetlib.dhp.schema.oaf.Relation + --hiveMetastoreUris${hiveMetastoreUris} + + + + + From 50d6a2ad3c2f6fe19a52ff279749bd5633917fd6 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 15 May 2020 09:53:37 +0200 Subject: [PATCH 13/23] added output directory removal in the blacklist spark actions; included common global properties in blacklist's workflow.xml --- .../blacklist/PrepareMergedRelationJob.java | 7 ++++ .../SparkRemoveBlacklistedRelationJob.java | 7 +++- .../dhp/blacklist/oozie_app/workflow.xml | 37 +++++++++++-------- 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java index 0ef59e8c2..2a46043e2 100644 --- a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java +++ b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java @@ -5,6 +5,7 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import java.util.Optional; +import eu.dnetlib.dhp.common.HdfsSupport; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -56,6 +57,7 @@ public class PrepareMergedRelationJob { conf, isSparkSessionManaged, spark -> { + removeOutputDir(spark, outputPath); selectMergesRelations( spark, inputPath, @@ -84,4 +86,9 @@ public class PrepareMergedRelationJob { (MapFunction) value -> OBJECT_MAPPER.readValue(value, Relation.class), Encoders.bean(Relation.class)); } + + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, 
spark.sparkContext().hadoopConfiguration()); + } + } diff --git a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java index 86587bfc9..fe4310217 100644 --- a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java +++ b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java @@ -6,6 +6,7 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.util.Objects; import java.util.Optional; +import eu.dnetlib.dhp.common.HdfsSupport; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -62,6 +63,7 @@ public class SparkRemoveBlacklistedRelationJob { conf, isSparkSessionManaged, spark -> { + removeOutputDir(spark, outputPath); removeBlacklistedRelations( spark, blacklistPath, @@ -69,7 +71,6 @@ public class SparkRemoveBlacklistedRelationJob { outputPath, mergesPath); }); - } private static void removeBlacklistedRelations(SparkSession spark, String blacklistPath, String inputPath, @@ -144,4 +145,8 @@ public class SparkRemoveBlacklistedRelationJob { Encoders.bean(Relation.class)); } + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } + } diff --git a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml index 1538318c1..dd7827da4 100644 --- a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml @@ -22,6 +22,25 @@ + + ${jobTracker} + 
${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + @@ -49,8 +68,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/publication ${nameNode}/${outputPath}/publication @@ -60,8 +77,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/dataset ${nameNode}/${outputPath}/dataset @@ -71,8 +86,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/otherresearchproduct ${nameNode}/${outputPath}/otherresearchproduct @@ -82,8 +95,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/software ${nameNode}/${outputPath}/software @@ -93,8 +104,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/organization ${nameNode}/${outputPath}/organization @@ -104,8 +113,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/project ${nameNode}/${outputPath}/project @@ -115,8 +122,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/datasource ${nameNode}/${outputPath}/datasource @@ -128,8 +133,6 @@ - ${jobTracker} - ${nameNode} eu.dnetlib.dhp.blacklist.ReadBlacklistFromDB --hdfsPath${workingDir}/blacklist --hdfsNameNode${nameNode} @@ -156,6 +159,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 --sourcePath${sourcePath}/relation --outputPath${workingDir}/mergesRelation @@ -180,6 +184,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 --sourcePath${sourcePath}/relation --outputPath${outputPath}/relation From b7e198475a7fdfacf5cdc3ea39d35f58af657d0c Mon Sep 17 00:00:00 2001 From: 
Claudio Atzori Date: Fri, 15 May 2020 10:20:07 +0200 Subject: [PATCH 14/23] added common methods to create HiveDB table identifiers --- .../dhp/schema/common/ModelSupport.java | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java index fc85b1ac1..9ee7c2deb 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java @@ -1,10 +1,15 @@ package eu.dnetlib.dhp.schema.common; +import static com.google.common.base.Preconditions.checkArgument; + import java.util.Map; +import java.util.Objects; import java.util.Optional; import java.util.function.Function; +import org.apache.commons.lang3.StringUtils; + import com.google.common.collect.Maps; import eu.dnetlib.dhp.schema.oaf.*; @@ -379,6 +384,21 @@ public class ModelSupport { entityMapping.get(EntityType.valueOf(targetType)).name()); } + public static String tableIdentifier(String dbName, String tableName) { + + checkArgument(StringUtils.isNotBlank(dbName), "DB name cannot be empty"); + checkArgument(StringUtils.isNotBlank(tableName), "table name cannot be empty"); + + return String.format("%s.%s", dbName, tableName); + } + + public static String tableIdentifier(String dbName, Class clazz) { + + checkArgument(Objects.nonNull(clazz), "clazz is needed to derive the table name, thus cannot be null"); + + return tableIdentifier(dbName, clazz.getSimpleName().toLowerCase()); + } + public static Function idFn() { return x -> { if (isSubClass(x, Relation.class)) { From a83265829608c6a318e0a35cbdc7abc95dd9d1b6 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 15 May 2020 10:21:09 +0200 Subject: [PATCH 15/23] code formatting --- .../dhp/blacklist/PrepareMergedRelationJob.java | 2 +- .../blacklist/SparkRemoveBlacklistedRelationJob.java | 2 +- 
.../java/eu/dnetlib/dhp/PropagationConstant.java | 4 ++-- .../PrepareDatasourceCountryAssociation.java | 2 +- .../PrepareResultOrcidAssociationStep1.java | 5 ++--- .../SparkOrcidToResultFromSemRelJob.java | 8 ++++---- .../PrepareProjectResultsAssociation.java | 2 +- .../SparkResultToProjectThroughSemRelJob.java | 6 +++--- .../PrepareResultCommunitySet.java | 2 +- .../PrepareResultInstRepoAssociation.java | 2 +- .../SparkResultToOrganizationFromIstRepoJob.java | 8 ++++---- .../oa/graph/raw/MigrateDbEntitiesApplication.java | 12 ++++++------ 12 files changed, 27 insertions(+), 28 deletions(-) diff --git a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java index 2a46043e2..b4bcc509e 100644 --- a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java +++ b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java @@ -5,7 +5,6 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import java.util.Optional; -import eu.dnetlib.dhp.common.HdfsSupport; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -19,6 +18,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.oaf.Relation; public class PrepareMergedRelationJob { diff --git a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java index fe4310217..92289ec2d 100644 --- 
a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java +++ b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java @@ -6,7 +6,6 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.util.Objects; import java.util.Optional; -import eu.dnetlib.dhp.common.HdfsSupport; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -19,6 +18,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.oaf.Relation; import scala.Tuple2; diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java index 13ed46508..c8eb017c7 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java @@ -4,8 +4,6 @@ package eu.dnetlib.dhp; import java.util.List; import java.util.Optional; -import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.common.ModelSupport; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -17,6 +15,8 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; public class PropagationConstant { diff --git 
a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java index 1d01d35e5..98b573102 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java @@ -7,7 +7,6 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import java.util.Arrays; import java.util.List; -import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; @@ -22,6 +21,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; /** diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java index 6549d1ed2..b15f813ac 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java @@ -95,8 +95,7 @@ public class PrepareResultOrcidAssociationStep1 { Dataset result = readPath(spark, inputResultPath, resultClazz); result.createOrReplaceTempView("result"); - String query = - "SELECT target resultId, author authorList" + String query = "SELECT target resultId, author authorList" + " FROM (SELECT id, collect_set(named_struct('name', 
name, 'surname', surname, 'fullname', fullname, 'orcid', orcid)) author " + " FROM ( " + " SELECT DISTINCT id, MyT.fullname, MyT.name, MyT.surname, MyP.value orcid " @@ -109,7 +108,7 @@ public class PrepareResultOrcidAssociationStep1 { + " SELECT source, target " + " FROM relation " + " WHERE datainfo.deletedbyinference = false " - + getConstraintList(" relclass = '", allowedsemrel) + + getConstraintList(" relclass = '", allowedsemrel) + " ) rel_rel " + " ON source = id"; spark diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java index fd1de3282..bea847ca7 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java @@ -151,10 +151,10 @@ public class SparkOrcidToResultFromSemRelJob { // in one of the two else { if (autoritative_author - .getName() - .trim() - .substring(0, 0) - .equalsIgnoreCase(author.getName().trim().substring(0, 0))) { + .getName() + .trim() + .substring(0, 0) + .equalsIgnoreCase(author.getName().trim().substring(0, 0))) { toaddpid = true; } } diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java index 920ef1f06..4cd7f88df 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java @@ -8,7 +8,6 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import 
java.util.Arrays; import java.util.List; -import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; @@ -22,6 +21,7 @@ import com.google.gson.Gson; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Relation; public class PrepareProjectResultsAssociation { diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java index 44a439ab7..1f6264c18 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java @@ -9,7 +9,6 @@ import java.util.Iterator; import java.util.List; import java.util.Optional; -import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.FlatMapFunction; @@ -21,6 +20,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Relation; import scala.Tuple2; @@ -124,7 +124,7 @@ public class SparkResultToProjectThroughSemRelJob { resId, projectId, ModelConstants.IS_PRODUCED_BY, - ModelConstants.RESULT_PROJECT , + ModelConstants.RESULT_PROJECT, ModelConstants.OUTCOME, PROPAGATION_DATA_INFO_TYPE, PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID, @@ -135,7 +135,7 @@ public 
class SparkResultToProjectThroughSemRelJob { projectId, resId, ModelConstants.PRODUCES, - ModelConstants.RESULT_PROJECT, + ModelConstants.RESULT_PROJECT, ModelConstants.OUTCOME, PROPAGATION_DATA_INFO_TYPE, PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID, diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java index fc9f395d1..5574aad75 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java @@ -6,7 +6,6 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import java.util.*; -import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -18,6 +17,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.google.gson.Gson; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Relation; public class PrepareResultCommunitySet { diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java index 0a83e4195..84e40fa88 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java @@ -4,7 +4,6 
@@ package eu.dnetlib.dhp.resulttoorganizationfrominstrepo; import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; -import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; @@ -18,6 +17,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Datasource; import eu.dnetlib.dhp.schema.oaf.Organization; import eu.dnetlib.dhp.schema.oaf.Relation; diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java index fc9a46d72..0ce741b87 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java @@ -6,7 +6,6 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import java.util.*; -import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; @@ -20,6 +19,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; import scala.Tuple2; @@ -151,7 +151,7 @@ public class SparkResultToOrganizationFromIstRepoJob { getRelation( orgId, resultId, - 
ModelConstants.IS_AUTHOR_INSTITUTION_OF, + ModelConstants.IS_AUTHOR_INSTITUTION_OF, ModelConstants.RESULT_ORGANIZATION, ModelConstants.AFFILIATION, PROPAGATION_DATA_INFO_TYPE, @@ -163,8 +163,8 @@ public class SparkResultToOrganizationFromIstRepoJob { resultId, orgId, ModelConstants.HAS_AUTHOR_INSTITUTION, - ModelConstants.RESULT_ORGANIZATION, - ModelConstants.AFFILIATION, + ModelConstants.RESULT_ORGANIZATION, + ModelConstants.AFFILIATION, PROPAGATION_DATA_INFO_TYPE, PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID, PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME)); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index ebe2b703b..5b8296c19 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -453,12 +453,12 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i final Boolean inferred = rs.getBoolean("inferred"); final String trust = rs.getString("trust"); return dataInfo( - deletedbyinference, - inferenceprovenance, - inferred, - false, - ENTITYREGISTRY_PROVENANCE_ACTION, - trust); + deletedbyinference, + inferenceprovenance, + inferred, + false, + ENTITYREGISTRY_PROVENANCE_ACTION, + trust); } private Qualifier prepareQualifierSplitting(final String s) { From 2a4e68a292d98e261527ab267c054cec9ae3aad9 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Fri, 15 May 2020 12:25:37 +0200 Subject: [PATCH 16/23] events recognition --- .../broker/oa/GenerateEventsApplication.java | 22 ++++---- .../EnrichMissingAbstract.java | 3 +- .../EnrichMissingAuthorOrcid.java | 3 +- .../oa/matchers/EnrichMissingOpenAccess.java | 55 +++++++++++++++++++ .../{util => 
matchers}/EnrichMissingPid.java | 19 ++++++- .../EnrichMissingProject.java | 6 +- .../EnrichMissingPublicationDate.java | 3 +- .../oa/matchers/EnrichMissingSubject.java | 53 ++++++++++++++++++ .../oa/matchers/EnrichMoreOpenAccess.java | 53 ++++++++++++++++++ .../oa/{util => matchers}/EnrichMorePid.java | 22 ++++++-- .../{util => matchers}/EnrichMoreSubject.java | 28 +++++++--- .../oa/{util => matchers}/UpdateMatcher.java | 3 +- .../dhp/broker/oa/util/BrokerConstants.java | 7 +++ .../dhp/broker/oa/util/ConversionUtils.java | 36 ++++++++++++ .../oa/util/EnrichMissingOpenAccess.java | 33 ----------- .../broker/oa/util/EnrichMissingSubject.java | 40 -------------- .../broker/oa/util/EnrichMoreOpenAccess.java | 33 ----------- .../dhp/broker/oa/util/UpdateInfo.java | 2 +- 18 files changed, 281 insertions(+), 140 deletions(-) rename dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/{util => matchers}/EnrichMissingAbstract.java (90%) rename dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/{util => matchers}/EnrichMissingAuthorOrcid.java (91%) create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingOpenAccess.java rename dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/{util => matchers}/EnrichMissingPid.java (60%) rename dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/{util => matchers}/EnrichMissingProject.java (86%) rename dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/{util => matchers}/EnrichMissingPublicationDate.java (89%) create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingSubject.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreOpenAccess.java rename dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/{util => matchers}/EnrichMorePid.java (50%) rename 
dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/{util => matchers}/EnrichMoreSubject.java (51%) rename dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/{util => matchers}/UpdateMatcher.java (94%) create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java delete mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingOpenAccess.java delete mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingSubject.java delete mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreOpenAccess.java diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java index c4c167c13..43ebd6dd8 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java @@ -17,18 +17,18 @@ import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.broker.model.Event; import eu.dnetlib.dhp.broker.model.EventFactory; -import eu.dnetlib.dhp.broker.oa.util.EnrichMissingAbstract; -import eu.dnetlib.dhp.broker.oa.util.EnrichMissingAuthorOrcid; -import eu.dnetlib.dhp.broker.oa.util.EnrichMissingOpenAccess; -import eu.dnetlib.dhp.broker.oa.util.EnrichMissingPid; -import eu.dnetlib.dhp.broker.oa.util.EnrichMissingProject; -import eu.dnetlib.dhp.broker.oa.util.EnrichMissingPublicationDate; -import eu.dnetlib.dhp.broker.oa.util.EnrichMissingSubject; -import 
eu.dnetlib.dhp.broker.oa.util.EnrichMoreOpenAccess; -import eu.dnetlib.dhp.broker.oa.util.EnrichMorePid; -import eu.dnetlib.dhp.broker.oa.util.EnrichMoreSubject; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingAbstract; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingAuthorOrcid; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingOpenAccess; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPid; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingProject; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPublicationDate; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingSubject; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMoreOpenAccess; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMorePid; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMoreSubject; +import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; -import eu.dnetlib.dhp.broker.oa.util.UpdateMatcher; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.oaf.Result; diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAbstract.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAbstract.java similarity index 90% rename from dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAbstract.java rename to dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAbstract.java index 6b6e35d1d..43cf738f8 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAbstract.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAbstract.java @@ -1,11 +1,12 @@ -package eu.dnetlib.dhp.broker.oa.util; +package eu.dnetlib.dhp.broker.oa.matchers; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import eu.dnetlib.dhp.broker.model.Topic; 
+import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; public class EnrichMissingAbstract extends UpdateMatcher { diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAuthorOrcid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAuthorOrcid.java similarity index 91% rename from dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAuthorOrcid.java rename to dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAuthorOrcid.java index d81427e05..beeccdbe8 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAuthorOrcid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAuthorOrcid.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.broker.oa.util; +package eu.dnetlib.dhp.broker.oa.matchers; import java.util.Arrays; import java.util.List; @@ -7,6 +7,7 @@ import java.util.List; import org.apache.commons.lang3.tuple.Pair; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; public class EnrichMissingAuthorOrcid extends UpdateMatcher> { diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingOpenAccess.java new file mode 100644 index 000000000..a4a2ea0c6 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingOpenAccess.java @@ -0,0 +1,55 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +import eu.dnetlib.broker.objects.Instance; +import eu.dnetlib.dhp.broker.model.Topic; +import 
eu.dnetlib.dhp.broker.oa.util.BrokerConstants; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMissingOpenAccess extends UpdateMatcher { + + public EnrichMissingOpenAccess() { + super(true); + } + + @Override + protected List> findUpdates(final Result source, final Result target) { + final long count = target + .getInstance() + .stream() + .map(i -> i.getAccessright().getClassid()) + .filter(right -> right.equals(BrokerConstants.OPEN_ACCESS)) + .count(); + + if (count > 0) { + return Arrays.asList(); + } + + return source + .getInstance() + .stream() + .filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS)) + .map(ConversionUtils::oafInstanceToBrokerInstances) + .flatMap(s -> s) + .map(i -> generateUpdateInfo(i, source, target)) + .collect(Collectors.toList()); + } + + @Override + public UpdateInfo generateUpdateInfo(final Instance highlightValue, + final Result source, + final Result target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_OA_VERSION, + highlightValue, source, target, + (p, i) -> p.getInstances().add(i), + Instance::getUrl); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPid.java similarity index 60% rename from dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPid.java rename to dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPid.java index 0b4045a0e..a8df62541 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPid.java @@ -1,11 +1,14 @@ -package eu.dnetlib.dhp.broker.oa.util; +package 
eu.dnetlib.dhp.broker.oa.matchers; import java.util.Arrays; import java.util.List; +import java.util.stream.Collectors; import eu.dnetlib.broker.objects.Pid; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; public class EnrichMissingPid extends UpdateMatcher { @@ -16,8 +19,18 @@ public class EnrichMissingPid extends UpdateMatcher { @Override protected List> findUpdates(final Result source, final Result target) { - // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); - return Arrays.asList(); + final long count = target.getPid().size(); + + if (count > 0) { + return Arrays.asList(); + } + + return source + .getPid() + .stream() + .map(ConversionUtils::oafPidToBrokerPid) + .map(i -> generateUpdateInfo(i, source, target)) + .collect(Collectors.toList()); } @Override diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingProject.java similarity index 86% rename from dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingProject.java rename to dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingProject.java index 45b16801c..b6e5b3b57 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingProject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingProject.java @@ -1,11 +1,12 @@ -package eu.dnetlib.dhp.broker.oa.util; +package eu.dnetlib.dhp.broker.oa.matchers; import java.util.Arrays; import java.util.List; import eu.dnetlib.broker.objects.Project; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; public class 
EnrichMissingProject extends UpdateMatcher { @@ -21,7 +22,8 @@ public class EnrichMissingProject extends UpdateMatcher { } @Override - public UpdateInfo generateUpdateInfo(final Project highlightValue, final Result source, + public UpdateInfo generateUpdateInfo(final Project highlightValue, + final Result source, final Result target) { return new UpdateInfo<>( Topic.ENRICH_MISSING_PROJECT, diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPublicationDate.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationDate.java similarity index 89% rename from dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPublicationDate.java rename to dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationDate.java index 7fcd2a66f..e9ec082c4 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPublicationDate.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationDate.java @@ -1,10 +1,11 @@ -package eu.dnetlib.dhp.broker.oa.util; +package eu.dnetlib.dhp.broker.oa.matchers; import java.util.Arrays; import java.util.List; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; public class EnrichMissingPublicationDate extends UpdateMatcher { diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingSubject.java new file mode 100644 index 000000000..79e9d469b --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingSubject.java @@ -0,0 +1,53 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import 
java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; + +public class EnrichMissingSubject extends UpdateMatcher> { + + public EnrichMissingSubject() { + super(true); + } + + @Override + protected List>> findUpdates(final Result source, final Result target) { + final Set existingTypes = target + .getSubject() + .stream() + .map(StructuredProperty::getQualifier) + .map(Qualifier::getClassid) + .collect(Collectors.toSet()); + + return source + .getPid() + .stream() + .filter(pid -> !existingTypes.contains(pid.getQualifier().getClassid())) + .map(ConversionUtils::oafSubjectToPair) + .map(i -> generateUpdateInfo(i, source, target)) + .collect(Collectors.toList()); + } + + @Override + public UpdateInfo> generateUpdateInfo(final Pair highlightValue, + final Result source, + final Result target) { + + return new UpdateInfo<>( + Topic.fromPath("ENRICH/MISSING/SUBJECT/" + highlightValue.getLeft()), + highlightValue, source, target, + (p, pair) -> p.getSubjects().add(pair.getRight()), + pair -> pair.getLeft() + "::" + pair.getRight()); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreOpenAccess.java new file mode 100644 index 000000000..40c9b0500 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreOpenAccess.java @@ -0,0 +1,53 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +import 
eu.dnetlib.broker.objects.Instance; +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMoreOpenAccess extends UpdateMatcher { + + public EnrichMoreOpenAccess() { + super(true); + } + + @Override + protected List> findUpdates(final Result source, final Result target) { + final Set urls = target + .getInstance() + .stream() + .filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS)) + .map(i -> i.getUrl()) + .flatMap(List::stream) + .collect(Collectors.toSet()); + + return source + .getInstance() + .stream() + .filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS)) + .map(ConversionUtils::oafInstanceToBrokerInstances) + .flatMap(s -> s) + .filter(i -> !urls.contains(i.getUrl())) + .map(i -> generateUpdateInfo(i, source, target)) + .collect(Collectors.toList()); + } + + @Override + public UpdateInfo generateUpdateInfo(final Instance highlightValue, + final Result source, + final Result target) { + return new UpdateInfo<>( + Topic.ENRICH_MORE_OA_VERSION, + highlightValue, source, target, + (p, i) -> p.getInstances().add(i), + Instance::getUrl); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMorePid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMorePid.java similarity index 50% rename from dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMorePid.java rename to dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMorePid.java index 8cd67f553..0e7b7766a 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMorePid.java +++ 
b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMorePid.java @@ -1,11 +1,14 @@ -package eu.dnetlib.dhp.broker.oa.util; +package eu.dnetlib.dhp.broker.oa.matchers; -import java.util.Arrays; import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; import eu.dnetlib.broker.objects.Pid; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; public class EnrichMorePid extends UpdateMatcher { @@ -16,8 +19,19 @@ public class EnrichMorePid extends UpdateMatcher { @Override protected List> findUpdates(final Result source, final Result target) { - // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); - return Arrays.asList(); + final Set existingPids = target + .getPid() + .stream() + .map(pid -> pid.getQualifier().getClassid() + "::" + pid.getValue()) + .collect(Collectors.toSet()); + + return source + .getPid() + .stream() + .filter(pid -> !existingPids.contains(pid.getQualifier().getClassid() + "::" + pid.getValue())) + .map(ConversionUtils::oafPidToBrokerPid) + .map(i -> generateUpdateInfo(i, source, target)) + .collect(Collectors.toList()); } @Override diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreSubject.java similarity index 51% rename from dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreSubject.java rename to dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreSubject.java index 9e0d8e693..e6374479b 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreSubject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreSubject.java @@ -1,12 
+1,15 @@ -package eu.dnetlib.dhp.broker.oa.util; +package eu.dnetlib.dhp.broker.oa.matchers; -import java.util.Arrays; import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; import org.apache.commons.lang3.tuple.Pair; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; public class EnrichMoreSubject extends UpdateMatcher> { @@ -17,18 +20,25 @@ public class EnrichMoreSubject extends UpdateMatcher> { @Override protected List>> findUpdates(final Result source, final Result target) { - // MESHEUROPMC - // ARXIV - // JEL - // DDC - // ACM + final Set existingSubjects = target + .getSubject() + .stream() + .map(pid -> pid.getQualifier().getClassid() + "::" + pid.getValue()) + .collect(Collectors.toSet()); - return Arrays.asList(); + return source + .getPid() + .stream() + .filter(pid -> !existingSubjects.contains(pid.getQualifier().getClassid() + "::" + pid.getValue())) + .map(ConversionUtils::oafSubjectToPair) + .map(i -> generateUpdateInfo(i, source, target)) + .collect(Collectors.toList()); } @Override public UpdateInfo> generateUpdateInfo(final Pair highlightValue, - final Result source, final Result target) { + final Result source, + final Result target) { return new UpdateInfo<>( Topic.fromPath("ENRICH/MORE/SUBJECT/" + highlightValue.getLeft()), diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateMatcher.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java similarity index 94% rename from dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateMatcher.java rename to dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java index 3fd6d4027..b8b6132cd 100644 --- 
a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateMatcher.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.broker.oa.util; +package eu.dnetlib.dhp.broker.oa.matchers; import java.util.Arrays; import java.util.Collection; @@ -10,6 +10,7 @@ import java.util.Map; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Field; import eu.dnetlib.dhp.schema.oaf.Result; diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java new file mode 100644 index 000000000..d61d5bfb7 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java @@ -0,0 +1,7 @@ + +package eu.dnetlib.dhp.broker.oa.util; + +public class BrokerConstants { + + public final static String OPEN_ACCESS = "OPEN"; +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java new file mode 100644 index 000000000..2e2ce202a --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java @@ -0,0 +1,36 @@ + +package eu.dnetlib.dhp.broker.oa.util; + +import java.util.stream.Stream; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.broker.objects.Instance; +import eu.dnetlib.broker.objects.Pid; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; + +public class ConversionUtils { + + public static Stream oafInstanceToBrokerInstances(final eu.dnetlib.dhp.schema.oaf.Instance i) { + return i.getUrl().stream().map(url -> { + final Instance r = 
new Instance(); + r.setUrl(url); + r.setInstancetype(i.getInstancetype().getClassid()); + r.setLicense(BrokerConstants.OPEN_ACCESS); + r.setHostedby(i.getHostedby().getValue()); + return r; + }); + } + + public static Pid oafPidToBrokerPid(final StructuredProperty sp) { + final Pid pid = new Pid(); + pid.setValue(sp.getValue()); + pid.setType(sp.getQualifier().getClassid()); + return pid; + } + + public static final Pair oafSubjectToPair(final StructuredProperty sp) { + return Pair.of(sp.getQualifier().getClassid(), sp.getValue()); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingOpenAccess.java deleted file mode 100644 index 9079ee24b..000000000 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingOpenAccess.java +++ /dev/null @@ -1,33 +0,0 @@ - -package eu.dnetlib.dhp.broker.oa.util; - -import java.util.Arrays; -import java.util.List; - -import eu.dnetlib.broker.objects.Instance; -import eu.dnetlib.dhp.broker.model.Topic; -import eu.dnetlib.dhp.schema.oaf.Result; - -public class EnrichMissingOpenAccess extends UpdateMatcher { - - public EnrichMissingOpenAccess() { - super(true); - } - - @Override - protected List> findUpdates(final Result source, final Result target) { - - return Arrays.asList(); - } - - @Override - public UpdateInfo generateUpdateInfo(final Instance highlightValue, final Result source, - final Result target) { - return new UpdateInfo<>( - Topic.ENRICH_MISSING_OA_VERSION, - highlightValue, source, target, - (p, i) -> p.getInstances().add(i), - Instance::getUrl); - } - -} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingSubject.java deleted file mode 100644 index 4470bd9d9..000000000 
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingSubject.java +++ /dev/null @@ -1,40 +0,0 @@ - -package eu.dnetlib.dhp.broker.oa.util; - -import java.util.Arrays; -import java.util.List; - -import org.apache.commons.lang3.tuple.Pair; - -import eu.dnetlib.dhp.broker.model.Topic; -import eu.dnetlib.dhp.schema.oaf.Result; - -public class EnrichMissingSubject extends UpdateMatcher> { - - public EnrichMissingSubject() { - super(true); - } - - @Override - protected List>> findUpdates(final Result source, final Result target) { - // MESHEUROPMC - // ARXIV - // JEL - // DDC - // ACM - - return Arrays.asList(); - } - - @Override - public UpdateInfo> generateUpdateInfo(final Pair highlightValue, - final Result source, final Result target) { - - return new UpdateInfo<>( - Topic.fromPath("ENRICH/MISSING/SUBJECT/" + highlightValue.getLeft()), - highlightValue, source, target, - (p, pair) -> p.getSubjects().add(pair.getRight()), - pair -> pair.getLeft() + "::" + pair.getRight()); - } - -} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreOpenAccess.java deleted file mode 100644 index bc37ce659..000000000 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreOpenAccess.java +++ /dev/null @@ -1,33 +0,0 @@ - -package eu.dnetlib.dhp.broker.oa.util; - -import java.util.Arrays; -import java.util.List; - -import eu.dnetlib.broker.objects.Instance; -import eu.dnetlib.dhp.broker.model.Topic; -import eu.dnetlib.dhp.schema.oaf.Result; - -public class EnrichMoreOpenAccess extends UpdateMatcher { - - public EnrichMoreOpenAccess() { - super(true); - } - - @Override - protected List> findUpdates(final Result source, final Result target) { - // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); - return Arrays.asList(); - } - - @Override - public 
UpdateInfo generateUpdateInfo(final Instance highlightValue, final Result source, - final Result target) { - return new UpdateInfo<>( - Topic.ENRICH_MORE_OA_VERSION, - highlightValue, source, target, - (p, i) -> p.getInstances().add(i), - Instance::getUrl); - } - -} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java index 1dfc14e5e..5cc0d371d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java @@ -25,7 +25,7 @@ public final class UpdateInfo { private final float trust; - protected UpdateInfo(final Topic topic, final T highlightValue, final Result source, final Result target, + public UpdateInfo(final Topic topic, final T highlightValue, final Result source, final Result target, final BiConsumer compileHighlight, final Function highlightToString) { this.topic = topic; From cfc8948717b4f5da506af7fc3c0a3d230ecb4c69 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 15 May 2020 12:26:16 +0200 Subject: [PATCH 17/23] fixed mapping OdfToGraph: pick the correct element to map author pids and author affiliations; extended mapping Oaf2Graph: added support for author pids --- .../dhp/oa/graph/raw/OafToOafMapper.java | 20 +++++-- .../dhp/oa/graph/raw/OdfToOafMapper.java | 8 +-- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 55 +++++++++++++++++-- .../dnetlib/dhp/oa/graph/raw/oaf_record.xml | 2 +- .../dnetlib/dhp/oa/graph/raw/odf_dataset.xml | 3 +- 5 files changed, 71 insertions(+), 17 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index 891fee57e..6b6aa15e8 100644 --- 
a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -1,15 +1,16 @@ package eu.dnetlib.dhp.oa.graph.raw; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field; +import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.*; import static eu.dnetlib.dhp.schema.common.ModelConstants.*; import java.util.*; import java.util.stream.Collectors; +import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.lang3.StringUtils; import org.dom4j.Document; +import org.dom4j.Element; import org.dom4j.Node; import com.google.common.collect.Lists; @@ -28,15 +29,24 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { final List res = new ArrayList<>(); int pos = 1; for (final Object o : doc.selectNodes("//dc:creator")) { - final Node n = (Node) o; + final Element e = (Element) o; final Author author = new Author(); - author.setFullname(n.getText()); + author.setFullname(e.getText()); author.setRank(pos++); - final PacePerson p = new PacePerson(n.getText(), false); + final PacePerson p = new PacePerson(e.getText(), false); if (p.isAccurate()) { author.setName(p.getNormalisedFirstName()); author.setSurname(p.getNormalisedSurname()); } + + final String pid = e.attributeValue("nameIdentifier"); + final String pidType = e.attributeValue("nameIdentifierScheme"); + + if (StringUtils.isNotBlank(pid) && StringUtils.isNotBlank(pidType)) { + author.setPid(new ArrayList<>()); + author.getPid().add(structuredProperty(pid, qualifier(pidType, pidType, DNET_PID_TYPES, DNET_PID_TYPES), info)); + } + res.add(author); } return res; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java 
b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 5baac12fd..30b980c42 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -63,17 +63,17 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { author.setSurname(surname); } - author.setAffiliation(prepareListFields(doc, "./datacite:affiliation", info)); - author.setPid(preparePids(doc, info)); + author.setAffiliation(prepareListFields(n, "./datacite:affiliation", info)); + author.setPid(preparePids(n, info)); author.setRank(pos++); res.add(author); } return res; } - private List preparePids(final Document doc, final DataInfo info) { + private List preparePids(final Node n, final DataInfo info) { final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes("./datacite:nameIdentifier")) { + for (final Object o : n.selectNodes("./datacite:nameIdentifier")) { res .add( structuredProperty( diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 951c97d9d..d7635c9ea 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -10,7 +10,10 @@ import static org.mockito.Mockito.when; import java.io.IOException; import java.util.List; import java.util.Map; +import java.util.Optional; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.*; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.junit.jupiter.api.BeforeEach; @@ -19,12 +22,6 @@ import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; import 
org.mockito.junit.jupiter.MockitoExtension; -import eu.dnetlib.dhp.schema.oaf.Dataset; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.Publication; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.schema.oaf.Software; - @ExtendWith(MockitoExtension.class) public class MappersTest { @@ -54,7 +51,26 @@ public class MappersTest { assertValidId(p.getId()); assertValidId(p.getCollectedfrom().get(0).getKey()); assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); + assertTrue(p.getAuthor().size() > 0); + Optional author = p.getAuthor() + .stream() + .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) + .findFirst(); + assertTrue(author.isPresent()); + StructuredProperty pid = author.get().getPid() + .stream() + .findFirst() + .get(); + assertEquals("0000-0001-6651-1178", pid.getValue()); + assertEquals("ORCID", pid.getQualifier().getClassid()); + assertEquals("ORCID", pid.getQualifier().getClassname()); + assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemeid()); + assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemename()); + assertEquals("Votsi,Nefta", author.get().getFullname()); + assertEquals("Votsi", author.get().getSurname()); + assertEquals("Nefta", author.get().getName()); + assertTrue(p.getSubject().size() > 0); assertTrue(StringUtils.isNotBlank(p.getJournal().getIssnOnline())); assertTrue(StringUtils.isNotBlank(p.getJournal().getName())); @@ -100,6 +116,33 @@ public class MappersTest { assertValidId(d.getCollectedfrom().get(0).getKey()); assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue())); assertTrue(d.getAuthor().size() > 0); + + Optional author = d.getAuthor() + .stream() + .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) + .findFirst(); + assertTrue(author.isPresent()); + StructuredProperty pid = author.get().getPid() + .stream() + .findFirst() + .get(); + assertEquals("0000-0001-9074-1619", pid.getValue()); + assertEquals("ORCID", 
pid.getQualifier().getClassid()); + assertEquals("ORCID", pid.getQualifier().getClassname()); + assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemeid()); + assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemename()); + assertEquals("Baracchini, Theo", author.get().getFullname()); + assertEquals("Baracchini", author.get().getSurname()); + assertEquals("Theo", author.get().getName()); + + assertEquals(1, author.get().getAffiliation().size()); + Optional> opAff = author.get().getAffiliation() + .stream() + .findFirst(); + assertTrue(opAff.isPresent()); + Field affiliation = opAff.get(); + assertEquals("ISTI-CNR", affiliation.getValue()); + assertTrue(d.getSubject().size() > 0); assertTrue(d.getInstance().size() > 0); assertTrue(d.getContext().size() > 0); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml index e898d4434..2cb0ba1c7 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml @@ -19,7 +19,7 @@ Ecosystem Service capacity is higher in areas of multiple designation types Nikolaidou,Charitini - Votsi,Nefta + Votsi,Nefta Sgardelis,Steanos Halley,John Pantis,John diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml index 94dc802fa..88ae9d106 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml @@ -35,9 +35,10 @@ Baracchini, Theo + 0000-0001-9074-1619 Theo Baracchini - Physics of Aquatic Systems Laboratory (APHYS) – Margaretha 
Kamprad Chair, ENAC, EPFL, Lausanne, 1015, Switzerland + ISTI-CNR Wüest, Alfred From 5ec8c49ad5144d056512ac25ea3a8444e71ca4f1 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 15 May 2020 12:49:58 +0200 Subject: [PATCH 18/23] removed serialization points --- .../dhp/blacklist/SparkRemoveBlacklistedRelationJob.java | 7 ------- 1 file changed, 7 deletions(-) diff --git a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java index 92289ec2d..91bcb9d1c 100644 --- a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java +++ b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java @@ -79,8 +79,6 @@ public class SparkRemoveBlacklistedRelationJob { Dataset inputRelation = readRelations(spark, inputPath); Dataset mergesRelation = readRelations(spark, mergesPath); - log.info("InputRelationCount: {}", inputRelation.count()); - Dataset dedupSource = blackListed .joinWith( mergesRelation, blackListed.col("source").equalTo(mergesRelation.col("target")), @@ -103,11 +101,6 @@ public class SparkRemoveBlacklistedRelationJob { return c._1(); }, Encoders.bean(Relation.class)); - dedupBL - .write() - .mode(SaveMode.Overwrite) - .json(blacklistPath + "/deduped"); - inputRelation .joinWith( dedupBL, (inputRelation From 7a89507ab1b1f347cdcfb46ffa8a908072aed057 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 15 May 2020 15:16:54 +0200 Subject: [PATCH 19/23] code formatting --- .../dhp/oa/graph/raw/OafToOafMapper.java | 6 ++- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 51 +++++++++++-------- 2 files changed, 34 insertions(+), 23 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java 
b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index 6b6aa15e8..54594cb80 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -7,7 +7,6 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.*; import java.util.*; import java.util.stream.Collectors; -import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.lang3.StringUtils; import org.dom4j.Document; import org.dom4j.Element; @@ -16,6 +15,7 @@ import org.dom4j.Node; import com.google.common.collect.Lists; import eu.dnetlib.dhp.oa.graph.raw.common.PacePerson; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; public class OafToOafMapper extends AbstractMdRecordToOafMapper { @@ -44,7 +44,9 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { if (StringUtils.isNotBlank(pid) && StringUtils.isNotBlank(pidType)) { author.setPid(new ArrayList<>()); - author.getPid().add(structuredProperty(pid, qualifier(pidType, pidType, DNET_PID_TYPES, DNET_PID_TYPES), info)); + author + .getPid() + .add(structuredProperty(pid, qualifier(pidType, pidType, DNET_PID_TYPES, DNET_PID_TYPES), info)); } res.add(author); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index d7635c9ea..5a006e351 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -12,8 +12,6 @@ import java.util.List; import java.util.Map; import java.util.Optional; -import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.*; import org.apache.commons.io.IOUtils; import 
org.apache.commons.lang3.StringUtils; import org.junit.jupiter.api.BeforeEach; @@ -22,6 +20,9 @@ import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.*; + @ExtendWith(MockitoExtension.class) public class MappersTest { @@ -53,15 +54,18 @@ public class MappersTest { assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); assertTrue(p.getAuthor().size() > 0); - Optional author = p.getAuthor() - .stream() - .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) - .findFirst(); + Optional author = p + .getAuthor() + .stream() + .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) + .findFirst(); assertTrue(author.isPresent()); - StructuredProperty pid = author.get().getPid() - .stream() - .findFirst() - .get(); + StructuredProperty pid = author + .get() + .getPid() + .stream() + .findFirst() + .get(); assertEquals("0000-0001-6651-1178", pid.getValue()); assertEquals("ORCID", pid.getQualifier().getClassid()); assertEquals("ORCID", pid.getQualifier().getClassname()); @@ -117,15 +121,18 @@ public class MappersTest { assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue())); assertTrue(d.getAuthor().size() > 0); - Optional author = d.getAuthor() - .stream() - .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) - .findFirst(); + Optional author = d + .getAuthor() + .stream() + .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) + .findFirst(); assertTrue(author.isPresent()); - StructuredProperty pid = author.get().getPid() - .stream() - .findFirst() - .get(); + StructuredProperty pid = author + .get() + .getPid() + .stream() + .findFirst() + .get(); assertEquals("0000-0001-9074-1619", pid.getValue()); assertEquals("ORCID", pid.getQualifier().getClassid()); assertEquals("ORCID", pid.getQualifier().getClassname()); @@ -136,9 +143,11 @@ public class MappersTest { 
assertEquals("Theo", author.get().getName()); assertEquals(1, author.get().getAffiliation().size()); - Optional> opAff = author.get().getAffiliation() - .stream() - .findFirst(); + Optional> opAff = author + .get() + .getAffiliation() + .stream() + .findFirst(); assertTrue(opAff.isPresent()); Field affiliation = opAff.get(); assertEquals("ISTI-CNR", affiliation.getValue()); From 82b615ab33be893b8527c0629b6a1e228fd80393 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 15 May 2020 16:04:46 +0200 Subject: [PATCH 20/23] NPE check --- .../src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java index d3ae8ee4f..a6ec364c4 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java @@ -129,6 +129,9 @@ public class DedupUtility { .max(Comparator.comparing(Tuple2::_1)); if (simAuhtor.isPresent() && simAuhtor.get()._1() > THRESHOLD) { Author r = simAuhtor.get()._2(); + if (r.getPid() == null) { + r.setPid(new ArrayList<>()); + } r.getPid().add(a._1()); } }); From 7838f2c63fb99e956aaf4bed024363b0c3f54178 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 15 May 2020 17:06:01 +0200 Subject: [PATCH 21/23] init the empty list for author pids mapped from OAF --- .../main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index 54594cb80..ed09016da 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java 
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -42,8 +42,8 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { final String pid = e.attributeValue("nameIdentifier"); final String pidType = e.attributeValue("nameIdentifierScheme"); + author.setPid(new ArrayList<>()); if (StringUtils.isNotBlank(pid) && StringUtils.isNotBlank(pidType)) { - author.setPid(new ArrayList<>()); author .getPid() .add(structuredProperty(pid, qualifier(pidType, pidType, DNET_PID_TYPES, DNET_PID_TYPES), info)); From ef9a9a9f1af74f49fe0d4dda802d33790a84c4a6 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 15 May 2020 22:34:19 +0200 Subject: [PATCH 22/23] remove the output path when starting --- .../PrepareResultCommunitySetStep2.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java index 723aa8960..09340369d 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java @@ -53,9 +53,7 @@ public class PrepareResultCommunitySetStep2 { conf, isSparkSessionManaged, spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } + removeOutputDir(spark, outputPath); mergeInfo(spark, inputPath, outputPath); }); } From b71fbb68b16cb1515614f16237d8fa8196871d29 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Mon, 18 May 2020 13:57:20 +0200 Subject: [PATCH 23/23] removed the removeOutputDir command from code. Relations are written in Append.
The erase of the output dir meant to remove all the relations computed in the previous steps --- .../SparkResultToOrganizationFromIstRepoJob.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java index 0ce741b87..0c5e1d8be 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java @@ -84,7 +84,7 @@ public class SparkResultToOrganizationFromIstRepoJob { conf, isSparkSessionManaged, spark -> { - removeOutputDir(spark, outputPath); + // removeOutputDir(spark, outputPath); if (saveGraph) { execPropagation( spark,