From c0265213a0b2119a30f00ec0013c6c88de3b826a Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Wed, 13 May 2020 12:00:27 +0200 Subject: [PATCH 01/24] partial implementation --- .../dhp/broker/model/EventFactory.java | 31 ++++----- .../eu/dnetlib/dhp/broker/model/Topic.java | 52 +++++++++++++++ .../broker/oa/GenerateEventsApplication.java | 55 +++++++--------- .../broker/oa/util/EnrichMissingAbstract.java | 30 +++++---- .../oa/util/EnrichMissingAuthorOrcid.java | 32 +++++----- .../oa/util/EnrichMissingOpenAccess.java | 31 ++++----- .../dhp/broker/oa/util/EnrichMissingPid.java | 28 ++++----- .../broker/oa/util/EnrichMissingProject.java | 30 ++++----- .../oa/util/EnrichMissingPublicationDate.java | 29 ++++----- .../broker/oa/util/EnrichMissingSubject.java | 30 +++++---- .../broker/oa/util/EnrichMoreOpenAccess.java | 29 ++++----- .../dhp/broker/oa/util/EnrichMorePid.java | 28 ++++----- .../dhp/broker/oa/util/EnrichMoreSubject.java | 30 +++++---- .../dhp/broker/oa/util/UpdateInfo.java | 59 ++++++++++++++--- .../dhp/broker/oa/util/UpdateMatcher.java | 63 +++++++++++++++++++ 15 files changed, 363 insertions(+), 194 deletions(-) create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Topic.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateMatcher.java diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java index 0694556b2e..9e5d986448 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java @@ -29,31 +29,32 @@ public class EventFactory { "yyyy-MM-dd" }; - public static Event newBrokerEvent(final Result source, final Result target, final UpdateInfo updateInfo) { + public static Event 
newBrokerEvent(final UpdateInfo updateInfo) { final long now = new Date().getTime(); final Event res = new Event(); - final Map map = createMapFromResult(target, source, updateInfo); + final Map map = createMapFromResult(updateInfo); - final String payload = createPayload(target, updateInfo); + final String payload = createPayload(updateInfo); final String eventId = calculateEventId( - updateInfo.getTopic(), target.getOriginalId().get(0), updateInfo.getHighlightValueAsString()); + updateInfo.getTopicPath(), updateInfo.getTarget().getOriginalId().get(0), + updateInfo.getHighlightValueAsString()); res.setEventId(eventId); res.setProducerId(PRODUCER_ID); res.setPayload(payload); res.setMap(map); - res.setTopic(updateInfo.getTopic()); + res.setTopic(updateInfo.getTopicPath()); res.setCreationDate(now); res.setExpiryDate(calculateExpiryDate(now)); res.setInstantMessage(false); return res; } - private static String createPayload(final Result result, final UpdateInfo updateInfo) { + private static String createPayload(final UpdateInfo updateInfo) { final OpenAireEventPayload payload = new OpenAireEventPayload(); // TODO @@ -62,32 +63,34 @@ public class EventFactory { return payload.toJSON(); } - private static Map createMapFromResult(final Result oaf, final Result source, - final UpdateInfo updateInfo) { + private static Map createMapFromResult(final UpdateInfo updateInfo) { final Map map = new HashMap<>(); - final List collectedFrom = oaf.getCollectedfrom(); + final Result source = updateInfo.getSource(); + final Result target = updateInfo.getTarget(); + + final List collectedFrom = target.getCollectedfrom(); if (collectedFrom.size() == 1) { map.put("target_datasource_id", collectedFrom.get(0).getKey()); map.put("target_datasource_name", collectedFrom.get(0).getValue()); } - final List ids = oaf.getOriginalId(); + final List ids = target.getOriginalId(); if (ids.size() > 0) { map.put("target_publication_id", ids.get(0)); } - final List titles = oaf.getTitle(); + final 
List titles = target.getTitle(); if (titles.size() > 0) { map.put("target_publication_title", titles.get(0)); } - final long date = parseDateTolong(oaf.getDateofacceptance().getValue()); + final long date = parseDateTolong(target.getDateofacceptance().getValue()); if (date > 0) { map.put("target_dateofacceptance", date); } - final List subjects = oaf.getSubject(); + final List subjects = target.getSubject(); if (subjects.size() > 0) { map .put( @@ -95,7 +98,7 @@ public class EventFactory { subjects.stream().map(StructuredProperty::getValue).collect(Collectors.toList())); } - final List authors = oaf.getAuthor(); + final List authors = target.getAuthor(); if (authors.size() > 0) { map .put( diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Topic.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Topic.java new file mode 100644 index 0000000000..29f6cbe3af --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Topic.java @@ -0,0 +1,52 @@ + +package eu.dnetlib.dhp.broker.model; + +public enum Topic { + + // ENRICHMENT MISSING + ENRICH_MISSING_OA_VERSION("ENRICH/MISSING/OPENACCESS_VERSION"), ENRICH_MISSING_ABSTRACT( + "ENRICH/MISSING/ABSTRACT"), ENRICH_MISSING_PUBLICATION_DATE( + "ENRICH/MISSING/PUBLICATION_DATE"), ENRICH_MISSING_PID( + "ENRICH/MISSING/PID"), ENRICH_MISSING_PROJECT("ENRICH/MISSING/PROJECT"), ENRICH_MISSING_SOFTWARE( + "ENRICH/MISSING/SOFTWARE"), ENRICH_MISSING_SUBJECT_MESHEUROPMC( + "ENRICH/MISSING/SUBJECT/MESHEUROPMC"), ENRICH_MISSING_SUBJECT_ARXIV( + "ENRICH/MISSING/SUBJECT/ARXIV"), ENRICH_MISSING_SUBJECT_JEL( + "ENRICH/MISSING/SUBJECT/JEL"), ENRICH_MISSING_SUBJECT_DDC( + "ENRICH/MISSING/SUBJECT/DDC"), ENRICH_MISSING_SUBJECT_ACM( + "ENRICH/MISSING/SUBJECT/ACM"), ENRICH_MISSING_SUBJECT_RVK( + "ENRICH/MISSING/SUBJECT/RVK"), ENRICH_MISSING_AUTHOR_ORCID( + "ENRICH/MISSING/AUTHOR/ORCID"), + + // ENRICHMENT MORE + ENRICH_MORE_PID("ENRICH/MORE/PID"), 
ENRICH_MORE_OA_VERSION("ENRICH/MORE/OPENACCESS_VERSION"), ENRICH_MORE_ABSTRACT( + "ENRICH/MORE/ABSTRACT"), ENRICH_MORE_PUBLICATION_DATE("ENRICH/MORE/PUBLICATION_DATE"), ENRICH_MORE_PROJECT( + "ENRICH/MORE/PROJECT"), ENRICH_MORE_SUBJECT_MESHEUROPMC( + "ENRICH/MORE/SUBJECT/MESHEUROPMC"), ENRICH_MORE_SUBJECT_ARXIV( + "ENRICH/MORE/SUBJECT/ARXIV"), ENRICH_MORE_SUBJECT_JEL( + "ENRICH/MORE/SUBJECT/JEL"), ENRICH_MORE_SUBJECT_DDC( + "ENRICH/MORE/SUBJECT/DDC"), ENRICH_MORE_SUBJECT_ACM( + "ENRICH/MORE/SUBJECT/ACM"), ENRICH_MORE_SUBJECT_RVK("ENRICH/MORE/SUBJECT/RVK"), + + // ADDITION + ADD_BY_PROJECT("ADD/BY_PROJECT"); + + Topic(final String path) { + this.path = path; + } + + protected String path; + + public String getPath() { + return this.path; + } + + public static Topic fromPath(final String path) { + for (final Topic t : Topic.values()) { + if (t.getPath().equals(path)) { + return t; + } + } + return null; + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java index 54d4ef36aa..c4c167c13f 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java @@ -14,8 +14,6 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.fasterxml.jackson.databind.ObjectMapper; - import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.broker.model.Event; import eu.dnetlib.dhp.broker.model.EventFactory; @@ -30,6 +28,7 @@ import eu.dnetlib.dhp.broker.oa.util.EnrichMoreOpenAccess; import eu.dnetlib.dhp.broker.oa.util.EnrichMorePid; import eu.dnetlib.dhp.broker.oa.util.EnrichMoreSubject; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import 
eu.dnetlib.dhp.broker.oa.util.UpdateMatcher; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.oaf.Result; @@ -37,7 +36,16 @@ public class GenerateEventsApplication { private static final Logger log = LoggerFactory.getLogger(GenerateEventsApplication.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final UpdateMatcher enrichMissingAbstract = new EnrichMissingAbstract(); + private static final UpdateMatcher enrichMissingAuthorOrcid = new EnrichMissingAuthorOrcid(); + private static final UpdateMatcher enrichMissingOpenAccess = new EnrichMissingOpenAccess(); + private static final UpdateMatcher enrichMissingPid = new EnrichMissingPid(); + private static final UpdateMatcher enrichMissingProject = new EnrichMissingProject(); + private static final UpdateMatcher enrichMissingPublicationDate = new EnrichMissingPublicationDate(); + private static final UpdateMatcher enrichMissingSubject = new EnrichMissingSubject(); + private static final UpdateMatcher enrichMoreOpenAccess = new EnrichMoreOpenAccess(); + private static final UpdateMatcher enrichMorePid = new EnrichMorePid(); + private static final UpdateMatcher enrichMoreSubject = new EnrichMoreSubject(); public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( @@ -76,37 +84,22 @@ public class GenerateEventsApplication { } private List generateEvents(final Result... 
children) { - final List list = new ArrayList<>(); + final List> list = new ArrayList<>(); - for (final Result source : children) { - for (final Result target : children) { - if (source != target) { - list - .addAll( - findUpdates(source, target) - .stream() - .map(info -> EventFactory.newBrokerEvent(source, target, info)) - .collect(Collectors.toList())); - } - } + for (final Result target : children) { + list.addAll(enrichMissingAbstract.searchUpdatesForRecord(target, children)); + list.addAll(enrichMissingAuthorOrcid.searchUpdatesForRecord(target, children)); + list.addAll(enrichMissingOpenAccess.searchUpdatesForRecord(target, children)); + list.addAll(enrichMissingPid.searchUpdatesForRecord(target, children)); + list.addAll(enrichMissingProject.searchUpdatesForRecord(target, children)); + list.addAll(enrichMissingPublicationDate.searchUpdatesForRecord(target, children)); + list.addAll(enrichMissingSubject.searchUpdatesForRecord(target, children)); + list.addAll(enrichMoreOpenAccess.searchUpdatesForRecord(target, children)); + list.addAll(enrichMorePid.searchUpdatesForRecord(target, children)); + list.addAll(enrichMoreSubject.searchUpdatesForRecord(target, children)); } - return list; - } - - private List> findUpdates(final Result source, final Result target) { - final List> list = new ArrayList<>(); - list.addAll(EnrichMissingAbstract.findUpdates(source, target)); - list.addAll(EnrichMissingAuthorOrcid.findUpdates(source, target)); - list.addAll(EnrichMissingOpenAccess.findUpdates(source, target)); - list.addAll(EnrichMissingPid.findUpdates(source, target)); - list.addAll(EnrichMissingProject.findUpdates(source, target)); - list.addAll(EnrichMissingPublicationDate.findUpdates(source, target)); - list.addAll(EnrichMissingSubject.findUpdates(source, target)); - list.addAll(EnrichMoreOpenAccess.findUpdates(source, target)); - list.addAll(EnrichMorePid.findUpdates(source, target)); - list.addAll(EnrichMoreSubject.findUpdates(source, target)); - return list; + 
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList()); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAbstract.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAbstract.java index 493d1f97c7..6b6e35d1d6 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAbstract.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAbstract.java @@ -1,31 +1,35 @@ package eu.dnetlib.dhp.broker.oa.util; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import eu.dnetlib.broker.objects.OpenAireEventPayload; +import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMissingAbstract extends UpdateInfo { +public class EnrichMissingAbstract extends UpdateMatcher { - public static List findUpdates(final Result source, final Result target) { - // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); - return Arrays.asList(); - } - - private EnrichMissingAbstract(final String highlightValue, final float trust) { - super("ENRICH/MISSING/ABSTRACT", highlightValue, trust); + public EnrichMissingAbstract() { + super(false); } @Override - public void compileHighlight(final OpenAireEventPayload payload) { - payload.getHighlight().getAbstracts().add(getHighlightValue()); + protected List> findUpdates(final Result source, final Result target) { + if (isMissing(target.getDescription()) && !isMissing(source.getDescription())) { + return Arrays.asList(generateUpdateInfo(source.getDescription().get(0).getValue(), source, target)); + } + return new ArrayList<>(); } @Override - public String getHighlightValueAsString() { - return getHighlightValue(); + public UpdateInfo generateUpdateInfo(final String highlightValue, final Result source, + final Result target) { + return new UpdateInfo<>( + 
Topic.ENRICH_MISSING_ABSTRACT, + highlightValue, source, target, + (p, s) -> p.getAbstracts().add(s), + s -> s); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAuthorOrcid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAuthorOrcid.java index 6899c62a37..d81427e05c 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAuthorOrcid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAuthorOrcid.java @@ -4,28 +4,30 @@ package eu.dnetlib.dhp.broker.oa.util; import java.util.Arrays; import java.util.List; -import eu.dnetlib.broker.objects.OpenAireEventPayload; +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMissingAuthorOrcid extends UpdateInfo { +public class EnrichMissingAuthorOrcid extends UpdateMatcher> { - public static List findUpdates(final Result source, final Result target) { + public EnrichMissingAuthorOrcid() { + super(true); + } + + @Override + protected List>> findUpdates(final Result source, final Result target) { // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); return Arrays.asList(); } - private EnrichMissingAuthorOrcid(final String highlightValue, final float trust) { - super("ENRICH/MISSING/AUTHOR/ORCID", highlightValue, trust); - } - @Override - public void compileHighlight(final OpenAireEventPayload payload) { - // TODO + public UpdateInfo> generateUpdateInfo(final Pair highlightValue, + final Result source, final Result target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_AUTHOR_ORCID, + highlightValue, source, target, + (p, pair) -> p.getCreators().add(pair.getLeft() + " - ORCID: " + pair.getRight()), + pair -> pair.getLeft() + "::" + pair.getRight()); } - - @Override - public String getHighlightValueAsString() { - return 
getHighlightValue(); - } - } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingOpenAccess.java index 9464130f31..9079ee24b0 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingOpenAccess.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingOpenAccess.java @@ -5,28 +5,29 @@ import java.util.Arrays; import java.util.List; import eu.dnetlib.broker.objects.Instance; -import eu.dnetlib.broker.objects.OpenAireEventPayload; +import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMissingOpenAccess extends UpdateInfo { +public class EnrichMissingOpenAccess extends UpdateMatcher { + + public EnrichMissingOpenAccess() { + super(true); + } + + @Override + protected List> findUpdates(final Result source, final Result target) { - public static List findUpdates(final Result source, final Result target) { - // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); return Arrays.asList(); } - private EnrichMissingOpenAccess(final Instance highlightValue, final float trust) { - super("ENRICH/MISSING/OPENACCESS_VERSION", highlightValue, trust); - } - @Override - public void compileHighlight(final OpenAireEventPayload payload) { - payload.getHighlight().getInstances().add(getHighlightValue()); - } - - @Override - public String getHighlightValueAsString() { - return getHighlightValue().getUrl(); + public UpdateInfo generateUpdateInfo(final Instance highlightValue, final Result source, + final Result target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_OA_VERSION, + highlightValue, source, target, + (p, i) -> p.getInstances().add(i), + Instance::getUrl); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPid.java 
b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPid.java index 293d4993f3..0b4045a0e4 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPid.java @@ -4,29 +4,29 @@ package eu.dnetlib.dhp.broker.oa.util; import java.util.Arrays; import java.util.List; -import eu.dnetlib.broker.objects.OpenAireEventPayload; import eu.dnetlib.broker.objects.Pid; +import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMissingPid extends UpdateInfo { +public class EnrichMissingPid extends UpdateMatcher { - public static List findUpdates(final Result source, final Result target) { + public EnrichMissingPid() { + super(true); + } + + @Override + protected List> findUpdates(final Result source, final Result target) { // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); return Arrays.asList(); } - private EnrichMissingPid(final Pid highlightValue, final float trust) { - super("ENRICH/MISSING/PID", highlightValue, trust); - } - @Override - public void compileHighlight(final OpenAireEventPayload payload) { - payload.getHighlight().getPids().add(getHighlightValue()); - } - - @Override - public String getHighlightValueAsString() { - return getHighlightValue().getType() + "::" + getHighlightValue().getValue(); + public UpdateInfo generateUpdateInfo(final Pid highlightValue, final Result source, final Result target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_PID, + highlightValue, source, target, + (p, pid) -> p.getPids().add(pid), + pid -> pid.getType() + "::" + pid.getValue()); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingProject.java index a22c179a20..45b16801c2 
100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingProject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingProject.java @@ -4,30 +4,30 @@ package eu.dnetlib.dhp.broker.oa.util; import java.util.Arrays; import java.util.List; -import eu.dnetlib.broker.objects.OpenAireEventPayload; import eu.dnetlib.broker.objects.Project; +import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMissingProject extends UpdateInfo { +public class EnrichMissingProject extends UpdateMatcher { - public static List findUpdates(final Result source, final Result target) { + public EnrichMissingProject() { + super(true); + } + + @Override + protected List> findUpdates(final Result source, final Result target) { // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); return Arrays.asList(); } - private EnrichMissingProject(final Project highlightValue, final float trust) { - super("ENRICH/MISSING/PROJECT", highlightValue, trust); - } - @Override - public void compileHighlight(final OpenAireEventPayload payload) { - payload.getHighlight().getProjects().add(getHighlightValue()); - } - - @Override - public String getHighlightValueAsString() { - return getHighlightValue().getFunder() + "::" + getHighlightValue().getFundingProgram() - + getHighlightValue().getCode(); + public UpdateInfo generateUpdateInfo(final Project highlightValue, final Result source, + final Result target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_PROJECT, + highlightValue, source, target, + (p, prj) -> p.getProjects().add(prj), + prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode()); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPublicationDate.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPublicationDate.java index 
869dca2645..7fcd2a66f9 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPublicationDate.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPublicationDate.java @@ -4,28 +4,29 @@ package eu.dnetlib.dhp.broker.oa.util; import java.util.Arrays; import java.util.List; -import eu.dnetlib.broker.objects.OpenAireEventPayload; +import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMissingPublicationDate extends UpdateInfo { +public class EnrichMissingPublicationDate extends UpdateMatcher { - public static List findUpdates(final Result source, final Result target) { + public EnrichMissingPublicationDate() { + super(false); + } + + @Override + protected List> findUpdates(final Result source, final Result target) { // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); return Arrays.asList(); } - private EnrichMissingPublicationDate(final String highlightValue, final float trust) { - super("ENRICH/MISSING/PUBLICATION_DATE", highlightValue, trust); - } - @Override - public void compileHighlight(final OpenAireEventPayload payload) { - payload.getHighlight().setPublicationdate(getHighlightValue()); - } - - @Override - public String getHighlightValueAsString() { - return getHighlightValue(); + public UpdateInfo generateUpdateInfo(final String highlightValue, final Result source, + final Result target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_PUBLICATION_DATE, + highlightValue, source, target, + (p, date) -> p.setPublicationdate(date), + s -> s); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingSubject.java index a2ed5d0439..4470bd9d98 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingSubject.java 
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingSubject.java @@ -4,12 +4,19 @@ package eu.dnetlib.dhp.broker.oa.util; import java.util.Arrays; import java.util.List; -import eu.dnetlib.broker.objects.OpenAireEventPayload; +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMissingSubject extends UpdateInfo { +public class EnrichMissingSubject extends UpdateMatcher> { - public static List findUpdates(final Result source, final Result target) { + public EnrichMissingSubject() { + super(true); + } + + @Override + protected List>> findUpdates(final Result source, final Result target) { // MESHEUROPMC // ARXIV // JEL @@ -19,18 +26,15 @@ public class EnrichMissingSubject extends UpdateInfo { return Arrays.asList(); } - private EnrichMissingSubject(final String subjectClassification, final String highlightValue, final float trust) { - super("ENRICH/MISSING/SUBJECT/" + subjectClassification, highlightValue, trust); - } - @Override - public void compileHighlight(final OpenAireEventPayload payload) { - payload.getHighlight().getSubjects().add(getHighlightValue()); - } + public UpdateInfo> generateUpdateInfo(final Pair highlightValue, + final Result source, final Result target) { - @Override - public String getHighlightValueAsString() { - return getHighlightValue(); + return new UpdateInfo<>( + Topic.fromPath("ENRICH/MISSING/SUBJECT/" + highlightValue.getLeft()), + highlightValue, source, target, + (p, pair) -> p.getSubjects().add(pair.getRight()), + pair -> pair.getLeft() + "::" + pair.getRight()); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreOpenAccess.java index 4f1e88d3d5..bc37ce659a 100644 --- 
a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreOpenAccess.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreOpenAccess.java @@ -5,28 +5,29 @@ import java.util.Arrays; import java.util.List; import eu.dnetlib.broker.objects.Instance; -import eu.dnetlib.broker.objects.OpenAireEventPayload; +import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMoreOpenAccess extends UpdateInfo { +public class EnrichMoreOpenAccess extends UpdateMatcher { - public static List findUpdates(final Result source, final Result target) { + public EnrichMoreOpenAccess() { + super(true); + } + + @Override + protected List> findUpdates(final Result source, final Result target) { // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); return Arrays.asList(); } - private EnrichMoreOpenAccess(final Instance highlightValue, final float trust) { - super("ENRICH/MORE/OPENACCESS_VERSION", highlightValue, trust); - } - @Override - public void compileHighlight(final OpenAireEventPayload payload) { - payload.getHighlight().getInstances().add(getHighlightValue()); - } - - @Override - public String getHighlightValueAsString() { - return getHighlightValue().getUrl(); + public UpdateInfo generateUpdateInfo(final Instance highlightValue, final Result source, + final Result target) { + return new UpdateInfo<>( + Topic.ENRICH_MORE_OA_VERSION, + highlightValue, source, target, + (p, i) -> p.getInstances().add(i), + Instance::getUrl); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMorePid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMorePid.java index ecf2cf3107..8cd67f5536 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMorePid.java +++ 
b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMorePid.java @@ -4,29 +4,29 @@ package eu.dnetlib.dhp.broker.oa.util; import java.util.Arrays; import java.util.List; -import eu.dnetlib.broker.objects.OpenAireEventPayload; import eu.dnetlib.broker.objects.Pid; +import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMorePid extends UpdateInfo { +public class EnrichMorePid extends UpdateMatcher { - public static List findUpdates(final Result source, final Result target) { + public EnrichMorePid() { + super(true); + } + + @Override + protected List> findUpdates(final Result source, final Result target) { // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); return Arrays.asList(); } - private EnrichMorePid(final Pid highlightValue, final float trust) { - super("ENRICH/MORE/PID", highlightValue, trust); - } - @Override - public void compileHighlight(final OpenAireEventPayload payload) { - payload.getHighlight().getPids().add(getHighlightValue()); - } - - @Override - public String getHighlightValueAsString() { - return getHighlightValue().getType() + "::" + getHighlightValue().getValue(); + public UpdateInfo generateUpdateInfo(final Pid highlightValue, final Result source, final Result target) { + return new UpdateInfo<>( + Topic.ENRICH_MORE_PID, + highlightValue, source, target, + (p, pid) -> p.getPids().add(pid), + pid -> pid.getType() + "::" + pid.getValue()); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreSubject.java index f29b86292d..9e0d8e6939 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreSubject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreSubject.java @@ -4,12 +4,19 @@ package eu.dnetlib.dhp.broker.oa.util; 
import java.util.Arrays; import java.util.List; -import eu.dnetlib.broker.objects.OpenAireEventPayload; +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMoreSubject extends UpdateInfo { +public class EnrichMoreSubject extends UpdateMatcher> { - public static List findUpdates(final Result source, final Result target) { + public EnrichMoreSubject() { + super(true); + } + + @Override + protected List>> findUpdates(final Result source, final Result target) { // MESHEUROPMC // ARXIV // JEL @@ -19,18 +26,15 @@ public class EnrichMoreSubject extends UpdateInfo { return Arrays.asList(); } - private EnrichMoreSubject(final String subjectClassification, final String highlightValue, final float trust) { - super("ENRICH/MORE/SUBJECT/" + subjectClassification, highlightValue, trust); - } - @Override - public void compileHighlight(final OpenAireEventPayload payload) { - payload.getHighlight().getSubjects().add(getHighlightValue()); - } + public UpdateInfo> generateUpdateInfo(final Pair highlightValue, + final Result source, final Result target) { - @Override - public String getHighlightValueAsString() { - return getHighlightValue(); + return new UpdateInfo<>( + Topic.fromPath("ENRICH/MORE/SUBJECT/" + highlightValue.getLeft()), + highlightValue, source, target, + (p, pair) -> p.getSubjects().add(pair.getRight()), + pair -> pair.getLeft() + "::" + pair.getRight()); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java index f7b6b69e9e..1dfc14e5eb 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java @@ -1,36 +1,77 @@ package eu.dnetlib.dhp.broker.oa.util; +import 
java.util.function.BiConsumer; +import java.util.function.Function; + import eu.dnetlib.broker.objects.OpenAireEventPayload; +import eu.dnetlib.broker.objects.Publication; +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.schema.oaf.Result; -public abstract class UpdateInfo { +public final class UpdateInfo { - private final String topic; + private final Topic topic; private final T highlightValue; + private final Result source; + + private final Result target; + + private final BiConsumer compileHighlight; + + private final Function highlightToString; + private final float trust; - protected UpdateInfo(final String topic, final T highlightValue, final float trust) { + protected UpdateInfo(final Topic topic, final T highlightValue, final Result source, final Result target, + final BiConsumer compileHighlight, + final Function highlightToString) { this.topic = topic; this.highlightValue = highlightValue; - this.trust = trust; + this.source = source; + this.target = target; + this.compileHighlight = compileHighlight; + this.highlightToString = highlightToString; + this.trust = calculateTrust(source, target); } public T getHighlightValue() { return highlightValue; } + public Result getSource() { + return source; + } + + public Result getTarget() { + return target; + } + + private float calculateTrust(final Result source, final Result target) { + // TODO + return 0.9f; + } + + protected Topic getTopic() { + return topic; + } + + public String getTopicPath() { + return topic.getPath(); + } + public float getTrust() { return trust; } - public String getTopic() { - return topic; + public void compileHighlight(final OpenAireEventPayload payload) { + compileHighlight.accept(payload.getHighlight(), getHighlightValue()); } - abstract public void compileHighlight(OpenAireEventPayload payload); - - abstract public String getHighlightValueAsString(); + public String getHighlightValueAsString() { + return highlightToString.apply(getHighlightValue()); + } } diff 
--git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateMatcher.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateMatcher.java new file mode 100644 index 0000000000..3fd6d40276 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateMatcher.java @@ -0,0 +1,63 @@ + +package eu.dnetlib.dhp.broker.oa.util; + +import java.util.Arrays; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.commons.lang3.StringUtils; + +import eu.dnetlib.dhp.schema.oaf.Field; +import eu.dnetlib.dhp.schema.oaf.Result; + +public abstract class UpdateMatcher { + + private final boolean multipleUpdate; + + public UpdateMatcher(final boolean multipleUpdate) { + this.multipleUpdate = multipleUpdate; + } + + public Collection> searchUpdatesForRecord(final Result res, final Result... 
others) { + + final Map> infoMap = new HashMap<>(); + + for (final Result source : others) { + if (source != res) { + for (final UpdateInfo info : findUpdates(source, res)) { + final String s = DigestUtils.md5Hex(info.getHighlightValueAsString()); + if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) { + } else { + infoMap.put(s, info); + } + } + } + } + + final Collection> values = infoMap.values(); + + if (values.isEmpty() || multipleUpdate) { + return values; + } else { + final UpdateInfo v = values + .stream() + .sorted((o1, o2) -> Float.compare(o1.getTrust(), o2.getTrust())) + .findFirst() + .get(); + return Arrays.asList(v); + } + } + + protected abstract List> findUpdates(Result source, Result target); + + protected abstract UpdateInfo generateUpdateInfo(final T highlightValue, final Result source, + final Result target); + + protected static boolean isMissing(final List> list) { + return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0).getValue()); + } + +} From 5ecacad70a0539bd94e336ad604cc32ac8e0ce1a Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 13 May 2020 17:01:11 +0200 Subject: [PATCH 02/24] fixed default resource typing in Oaf/Odf mapping --- .../raw/AbstractMdRecordToOafMapper.java | 168 +++++++++--------- 1 file changed, 84 insertions(+), 84 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index fd12716b42..be0b910226 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -64,7 +64,7 @@ public abstract class AbstractMdRecordToOafMapper { } protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier( - "main title", "main title", 
"dnet:dataCite_title", "dnet:dataCite_title"); + "main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); protected AbstractMdRecordToOafMapper(final Map code2name) { this.code2name = code2name; @@ -75,20 +75,20 @@ public abstract class AbstractMdRecordToOafMapper { DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); final Document doc = DocumentHelper - .parseText( - xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3)); + .parseText( + xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3)); final String type = doc.valueOf("//dr:CobjCategory/@type"); final KeyValue collectedFrom = getProvenanceDatasource( - doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); + doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); if (collectedFrom == null) { return null; } final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) - ? collectedFrom - : getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name"); + ? 
collectedFrom + : getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name"); if (hostedBy == null) { return null; @@ -112,22 +112,21 @@ public abstract class AbstractMdRecordToOafMapper { } return keyValue( - createOpenaireId(10, dsId, true), - dsName); + createOpenaireId(10, dsId, true), + dsName); } protected List createOafs( - final Document doc, - final String type, - final KeyValue collectedFrom, - final KeyValue hostedBy, - final DataInfo info, - final long lastUpdateTimestamp) { + final Document doc, + final String type, + final KeyValue collectedFrom, + final KeyValue hostedBy, + final DataInfo info, + final long lastUpdateTimestamp) { final List oafs = new ArrayList<>(); switch (type.toLowerCase()) { - case "": case "publication": final Publication p = new Publication(); populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); @@ -138,7 +137,7 @@ public abstract class AbstractMdRecordToOafMapper { case "dataset": final Dataset d = new Dataset(); populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - d.setResulttype(PUBLICATION_DEFAULT_RESULTTYPE); + d.setResulttype(DATASET_DEFAULT_RESULTTYPE); d.setStoragedate(prepareDatasetStorageDate(doc, info)); d.setDevice(prepareDatasetDevice(doc, info)); d.setSize(prepareDatasetSize(doc, info)); @@ -158,6 +157,7 @@ public abstract class AbstractMdRecordToOafMapper { s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info)); oafs.add(s); break; + case "": case "otherresearchproducts": default: final OtherResearchProduct o = new OtherResearchProduct(); @@ -179,10 +179,10 @@ public abstract class AbstractMdRecordToOafMapper { } private List addProjectRels( - final Document doc, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp) { + final Document doc, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp) { final List res = new ArrayList<>(); @@ -196,15 +196,15 @@ 
public abstract class AbstractMdRecordToOafMapper { final String projectId = createOpenaireId(40, originalId, true); res - .add( - getRelation( - docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, collectedFrom, info, - lastUpdateTimestamp)); + .add( + getRelation( + docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, collectedFrom, info, + lastUpdateTimestamp)); res - .add( - getRelation( - projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, collectedFrom, info, - lastUpdateTimestamp)); + .add( + getRelation( + projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, collectedFrom, info, + lastUpdateTimestamp)); } } @@ -212,7 +212,7 @@ public abstract class AbstractMdRecordToOafMapper { } protected Relation getRelation(String source, String target, String relType, String subRelType, String relClass, - KeyValue collectedFrom, DataInfo info, long lastUpdateTimestamp) { + KeyValue collectedFrom, DataInfo info, long lastUpdateTimestamp) { final Relation rel = new Relation(); rel.setRelType(relType); rel.setSubRelType(subRelType); @@ -226,27 +226,27 @@ public abstract class AbstractMdRecordToOafMapper { } protected abstract List addOtherResultRels( - final Document doc, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp); + final Document doc, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp); private void populateResultFields( - final Result r, - final Document doc, - final KeyValue collectedFrom, - final KeyValue hostedBy, - final DataInfo info, - final long lastUpdateTimestamp) { + final Result r, + final Document doc, + final KeyValue collectedFrom, + final KeyValue hostedBy, + final DataInfo info, + final long lastUpdateTimestamp) { r.setDataInfo(info); r.setLastupdatetimestamp(lastUpdateTimestamp); r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false)); r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier"))); 
r.setCollectedfrom(Arrays.asList(collectedFrom)); r - .setPid( - prepareListStructProps( - doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); + .setPid( + prepareListStructProps( + doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); r.setDateofcollection(doc.valueOf("//dr:dateOfCollection")); r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation")); r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES @@ -289,7 +289,7 @@ public abstract class AbstractMdRecordToOafMapper { protected abstract Qualifier prepareResourceType(Document doc, DataInfo info); protected abstract List prepareInstances( - Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby); + Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby); protected abstract List> prepareSources(Document doc, DataInfo info); @@ -314,13 +314,13 @@ public abstract class AbstractMdRecordToOafMapper { protected abstract List prepareAuthors(Document doc, DataInfo info); protected abstract List> prepareOtherResearchProductTools( - Document doc, DataInfo info); + Document doc, DataInfo info); protected abstract List> prepareOtherResearchProductContactGroups( - Document doc, DataInfo info); + Document doc, DataInfo info); protected abstract List> prepareOtherResearchProductContactPersons( - Document doc, DataInfo info); + Document doc, DataInfo info); protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info); @@ -329,7 +329,7 @@ public abstract class AbstractMdRecordToOafMapper { protected abstract List prepareSoftwareLicenses(Document doc, DataInfo info); protected abstract List> prepareSoftwareDocumentationUrls( - Document doc, DataInfo info); + Document doc, DataInfo info); protected abstract List prepareDatasetGeoLocations(Document doc, DataInfo info); @@ -359,37 +359,37 @@ public abstract class AbstractMdRecordToOafMapper { final String edition = 
n.valueOf("@edition"); if (StringUtils.isNotBlank(name)) { return journal( - name, - issnPrinted, - issnOnline, - issnLinking, - ep, - iss, - sp, - vol, - edition, - null, - null, - info); + name, + issnPrinted, + issnOnline, + issnLinking, + ep, + iss, + sp, + vol, + edition, + null, + null, + info); } } return null; } protected Qualifier prepareQualifier( - final Node node, final String xpath, final String schemeId, final String schemeName) { + final Node node, final String xpath, final String schemeId, final String schemeName) { final String classId = node.valueOf(xpath); final String className = code2name.get(classId); return qualifier(classId, className, schemeId, schemeName); } protected List prepareListStructProps( - final Node node, - final String xpath, - final String xpathClassId, - final String schemeId, - final String schemeName, - final DataInfo info) { + final Node node, + final String xpath, + final String xpathClassId, + final String schemeId, + final String schemeName, + final DataInfo info) { final List res = new ArrayList<>(); for (final Object o : node.selectNodes(xpath)) { final Node n = (Node) o; @@ -401,7 +401,7 @@ public abstract class AbstractMdRecordToOafMapper { } protected List prepareListStructProps( - final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) { + final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) { final List res = new ArrayList<>(); for (final Object o : node.selectNodes(xpath)) { final Node n = (Node) o; @@ -411,19 +411,19 @@ public abstract class AbstractMdRecordToOafMapper { } protected List prepareListStructProps( - final Node node, final String xpath, final DataInfo info) { + final Node node, final String xpath, final DataInfo info) { final List res = new ArrayList<>(); for (final Object o : node.selectNodes(xpath)) { final Node n = (Node) o; res - .add( - structuredProperty( - n.getText(), - n.valueOf("@classid"), - n.valueOf("@classname"), - 
n.valueOf("@schemeid"), - n.valueOf("@schemename"), - info)); + .add( + structuredProperty( + n.getText(), + n.valueOf("@classid"), + n.valueOf("@classname"), + n.valueOf("@schemeid"), + n.valueOf("@schemename"), + info)); } return res; } @@ -450,7 +450,7 @@ public abstract class AbstractMdRecordToOafMapper { if (n == null) { return dataInfo( - false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9"); + false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9"); } final String paClassId = n.valueOf("./oaf:provenanceaction/@classid"); @@ -464,12 +464,12 @@ public abstract class AbstractMdRecordToOafMapper { final String trust = n.valueOf("./oaf:trust"); return dataInfo( - deletedbyinference, - inferenceprovenance, - inferred, - false, - qualifier(paClassId, paClassName, paSchemeId, paSchemeName), - trust); + deletedbyinference, + inferenceprovenance, + inferred, + false, + qualifier(paClassId, paClassName, paSchemeId, paSchemeName), + trust); } protected Field prepareField(final Node node, final String xpath, final DataInfo info) { @@ -477,7 +477,7 @@ public abstract class AbstractMdRecordToOafMapper { } protected List> prepareListFields( - final Node node, final String xpath, final DataInfo info) { + final Node node, final String xpath, final DataInfo info) { return listFields(info, prepareListString(node, xpath)); } From ab37953332755ed53ebf95655547a4736d8f7395 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 14 May 2020 10:25:41 +0200 Subject: [PATCH 03/24] added global properties in wf definitions to avoid repeating name-node and job-tracker in the (many) distcp actions; reintroduced output directory removal at the beginning of each spark action --- .../dnetlib/dhp/bulktag/SparkBulkTagJob.java | 2 + .../SparkCountryPropagationJob.java | 17 +- .../PrepareResultOrcidAssociationStep1.java | 4 +- .../PrepareResultOrcidAssociationStep2.java | 4 +- .../SparkOrcidToResultFromSemRelJob.java | 7 +- .../PrepareProjectResultsAssociation.java | 2 + 
.../PrepareResultCommunitySet.java | 4 +- ...kResultToCommunityFromOrganizationJob.java | 7 +- .../PrepareResultInstRepoAssociation.java | 43 +++-- ...arkResultToOrganizationFromIstRepoJob.java | 7 +- .../dhp/bulktag/oozie_app/workflow.xml | 27 ++- .../countrypropagation/oozie_app/workflow.xml | 20 ++- .../oozie_app/workflow.xml | 7 +- .../projecttoresult/oozie_app/workflow.xml | 29 ++-- .../oozie_app/workflow.xml | 29 ++-- .../oozie_app/workflow.xml | 52 +++--- .../raw/AbstractMdRecordToOafMapper.java | 164 +++++++++--------- 17 files changed, 213 insertions(+), 212 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java index 75d85e2bad..1c65e8adec 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.bulktag; +import static eu.dnetlib.dhp.PropagationConstant.removeOutputDir; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.util.Optional; @@ -84,6 +85,7 @@ public class SparkBulkTagJob { conf, isSparkSessionManaged, spark -> { + removeOutputDir(spark, outputPath); execBulkTag(spark, inputPath, outputPath, protoMappingParams, resultClazz, cc); }); } diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java index 9dc17701bc..974b3a3b11 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java @@ -69,13 +69,16 @@ public class SparkCountryPropagationJob { 
runWithSparkSession( conf, isSparkSessionManaged, - spark -> execPropagation( - spark, - sourcePath, - preparedInfoPath, - outputPath, - resultClazz, - saveGraph)); + spark -> { + removeOutputDir(spark, outputPath); + execPropagation( + spark, + sourcePath, + preparedInfoPath, + outputPath, + resultClazz, + saveGraph); + }); } private static void execPropagation( diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java index 3e16b4b4b3..400c8d8efa 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java @@ -74,9 +74,7 @@ public class PrepareResultOrcidAssociationStep1 { conf, isSparkSessionManaged, spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } + removeOutputDir(spark, outputPath); prepareInfo( spark, inputRelationPath, inputResultPath, outputResultPath, resultClazz, allowedsemrel); }); diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java index 65d8811bc7..2cea32e58f 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java @@ -50,9 +50,7 @@ public class PrepareResultOrcidAssociationStep2 { conf, isSparkSessionManaged, spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } + removeOutputDir(spark, outputPath); 
mergeInfo(spark, inputPath, outputPath); }); } diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java index ebb75a5a67..b34b29c481 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java @@ -70,11 +70,10 @@ public class SparkOrcidToResultFromSemRelJob { conf, isSparkSessionManaged, spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } - if (saveGraph) + removeOutputDir(spark, outputPath); + if (saveGraph) { execPropagation(spark, possibleUpdates, inputPath, outputPath, resultClazz); + } }); } diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java index 05dcdc6928..c27da42583 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java @@ -60,6 +60,8 @@ public class PrepareProjectResultsAssociation { conf, isSparkSessionManaged, spark -> { + removeOutputDir(spark, potentialUpdatePath); + removeOutputDir(spark, alreadyLinkedPath); prepareResultProjProjectResults( spark, inputPath, diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java index e2d4d56878..90eb54e5fa 100644 --- 
a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java @@ -55,9 +55,7 @@ public class PrepareResultCommunitySet { conf, isSparkSessionManaged, spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } + removeOutputDir(spark, outputPath); prepareInfo(spark, inputPath, outputPath, organizationMap); }); } diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java index 71275cc7f0..66297e1779 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java @@ -68,11 +68,10 @@ public class SparkResultToCommunityFromOrganizationJob { conf, isSparkSessionManaged, spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } - if (saveGraph) + removeOutputDir(spark, outputPath); + if (saveGraph) { execPropagation(spark, inputPath, outputPath, resultClazz, possibleupdatespath); + } }); } diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java index f8fe1668fd..5f549be531 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java +++ 
b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java @@ -58,30 +58,15 @@ public class PrepareResultInstRepoAssociation { isSparkSessionManaged, spark -> { readNeededResources(spark, inputPath); + + removeOutputDir(spark, datasourceOrganizationPath); prepareDatasourceOrganization(spark, datasourceOrganizationPath); + + removeOutputDir(spark, alreadyLinkedPath); prepareAlreadyLinkedAssociation(spark, alreadyLinkedPath); }); } - private static void prepareAlreadyLinkedAssociation( - SparkSession spark, String alreadyLinkedPath) { - String query = "Select source resultId, collect_set(target) organizationSet " - + "from relation " - + "where datainfo.deletedbyinference = false " - + "and relClass = '" - + RELATION_RESULT_ORGANIZATION_REL_CLASS - + "' " - + "group by source"; - - spark - .sql(query) - .as(Encoders.bean(ResultOrganizationSet.class)) - // TODO retry to stick with datasets - .toJavaRDD() - .map(r -> OBJECT_MAPPER.writeValueAsString(r)) - .saveAsTextFile(alreadyLinkedPath, GzipCodec.class); - } - private static void readNeededResources(SparkSession spark, String inputPath) { Dataset datasource = readPath(spark, inputPath + "/datasource", Datasource.class); datasource.createOrReplaceTempView("datasource"); @@ -119,4 +104,24 @@ public class PrepareResultInstRepoAssociation { .option("compression", "gzip") .json(datasourceOrganizationPath); } + + private static void prepareAlreadyLinkedAssociation( + SparkSession spark, String alreadyLinkedPath) { + String query = "Select source resultId, collect_set(target) organizationSet " + + "from relation " + + "where datainfo.deletedbyinference = false " + + "and relClass = '" + + RELATION_RESULT_ORGANIZATION_REL_CLASS + + "' " + + "group by source"; + + spark + .sql(query) + .as(Encoders.bean(ResultOrganizationSet.class)) + // TODO retry to stick with datasets + .toJavaRDD() + .map(r -> OBJECT_MAPPER.writeValueAsString(r)) + 
.saveAsTextFile(alreadyLinkedPath, GzipCodec.class); + } + } diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java index 86634d43fc..13577fa7c1 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java @@ -83,10 +83,8 @@ public class SparkResultToOrganizationFromIstRepoJob { conf, isSparkSessionManaged, spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } - if (saveGraph) + removeOutputDir(spark, outputPath); + if (saveGraph) { execPropagation( spark, datasourceorganization, @@ -94,6 +92,7 @@ public class SparkResultToOrganizationFromIstRepoJob { inputPath, outputPath, resultClazz); + } }); } diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml index 754aba4f29..f019f8413d 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml @@ -18,6 +18,17 @@ + + ${jobTracker} + ${nameNode} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + @@ -42,8 +53,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/relation ${nameNode}/${outputPath}/relation @@ -53,8 +62,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/organization ${nameNode}/${outputPath}/organization @@ -64,8 +71,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/project 
${nameNode}/${outputPath}/project @@ -75,8 +80,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/datasource ${nameNode}/${outputPath}/datasource @@ -95,8 +98,6 @@ - ${jobTracker} - ${nameNode} yarn-cluster cluster bulkTagging-publication @@ -124,8 +125,6 @@ - ${jobTracker} - ${nameNode} yarn-cluster cluster bulkTagging-dataset @@ -153,8 +152,6 @@ - ${jobTracker} - ${nameNode} yarn-cluster cluster bulkTagging-orp @@ -182,8 +179,6 @@ - ${jobTracker} - ${nameNode} yarn-cluster cluster bulkTagging-software diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml index fc877071df..85116e4cc9 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml @@ -19,6 +19,17 @@ + + ${jobTracker} + ${nameNode} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + @@ -43,8 +54,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/relation ${nameNode}/${outputPath}/relation @@ -54,18 +63,15 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/organization ${nameNode}/${outputPath}/organization + - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/project ${nameNode}/${outputPath}/project @@ -75,8 +81,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/datasource ${nameNode}/${outputPath}/datasource diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/workflow.xml index e4429b710c..5ddc5fedf4 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/workflow.xml +++ 
b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/workflow.xml @@ -57,6 +57,7 @@ + ${jobTracker} @@ -81,7 +82,6 @@ - @@ -230,8 +230,8 @@ - + @@ -271,6 +271,7 @@ + yarn @@ -302,6 +303,7 @@ + yarn @@ -333,6 +335,7 @@ + yarn diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/workflow.xml index 24e1d3b7fa..9e91c06fb3 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/workflow.xml @@ -14,6 +14,17 @@ + + ${jobTracker} + ${nameNode} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + @@ -42,8 +53,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/relation ${nameNode}/${outputPath}/relation @@ -53,8 +62,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/publication ${nameNode}/${outputPath}/publication @@ -64,8 +71,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/dataset ${nameNode}/${outputPath}/dataset @@ -75,8 +80,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/otherresearchproduct ${nameNode}/${outputPath}/otherresearchproduct @@ -86,28 +89,24 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/software ${nameNode}/${outputPath}/software + - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/organization ${nameNode}/${outputPath}/organization + - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/project ${nameNode}/${outputPath}/project @@ -117,8 +116,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/datasource ${nameNode}/${outputPath}/datasource diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/workflow.xml 
b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/workflow.xml index d481cad052..6a329fdc46 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/workflow.xml @@ -14,6 +14,17 @@ + + ${jobTracker} + ${nameNode} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + @@ -38,8 +49,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/relation ${nameNode}/${outputPath}/relation @@ -49,8 +58,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/organization ${nameNode}/${outputPath}/organization @@ -60,8 +67,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/project ${nameNode}/${outputPath}/project @@ -71,8 +76,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/datasource ${nameNode}/${outputPath}/datasource @@ -101,8 +104,8 @@ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --sourcePath${sourcePath}/relation - --hive_metastore_uris${hive_metastore_uris} --outputPath${workingDir}/preparedInfo/resultCommunityList + --hive_metastore_uris${hive_metastore_uris} --organizationtoresultcommunitymap${organizationtoresultcommunitymap} @@ -136,9 +139,9 @@ --preparedInfoPath${workingDir}/preparedInfo/resultCommunityList --sourcePath${sourcePath}/publication + --outputPath${outputPath}/publication --hive_metastore_uris${hive_metastore_uris} --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication - --outputPath${outputPath}/publication --saveGraph${saveGraph} @@ -165,9 +168,9 @@ --preparedInfoPath${workingDir}/preparedInfo/resultCommunityList --sourcePath${sourcePath}/dataset + --outputPath${outputPath}/dataset --hive_metastore_uris${hive_metastore_uris} --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset - --outputPath${outputPath}/dataset 
--saveGraph${saveGraph} @@ -194,9 +197,9 @@ --preparedInfoPath${workingDir}/preparedInfo/resultCommunityList --sourcePath${sourcePath}/otherresearchproduct + --outputPath${outputPath}/otherresearchproduct --hive_metastore_uris${hive_metastore_uris} --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct - --outputPath${outputPath}/otherresearchproduct --saveGraph${saveGraph} @@ -223,9 +226,9 @@ --preparedInfoPath${workingDir}/preparedInfo/resultCommunityList --sourcePath${sourcePath}/software + --outputPath${outputPath}/software --hive_metastore_uris${hive_metastore_uris} --resultTableNameeu.dnetlib.dhp.schema.oaf.Software - --outputPath${outputPath}/software --saveGraph${saveGraph} diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml index a1b7f4ad79..e0563abae8 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml @@ -10,6 +10,17 @@ + + ${jobTracker} + ${nameNode} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + @@ -38,8 +49,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/relation ${nameNode}/${outputPath}/relation @@ -49,8 +58,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/publication ${nameNode}/${outputPath}/publication @@ -60,8 +67,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/dataset ${nameNode}/${outputPath}/dataset @@ -71,8 +76,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/otherresearchproduct ${nameNode}/${outputPath}/otherresearchproduct @@ -82,8 +85,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/software ${nameNode}/${outputPath}/software @@ -93,8 
+94,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/organization ${nameNode}/${outputPath}/organization @@ -104,8 +103,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/project ${nameNode}/${outputPath}/project @@ -115,8 +112,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/datasource ${nameNode}/${outputPath}/datasource @@ -125,6 +120,7 @@ + yarn @@ -176,12 +172,12 @@ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --sourcePath${sourcePath}/publication - --hive_metastore_uris${hive_metastore_uris} - --saveGraph${saveGraph} - --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication --outputPath${outputPath}/relation --datasourceOrganizationPath${workingDir}/preparedInfo/datasourceOrganization --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + --hive_metastore_uris${hive_metastore_uris} + --saveGraph${saveGraph} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication @@ -206,12 +202,12 @@ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --sourcePath${sourcePath}/dataset - --hive_metastore_uris${hive_metastore_uris} - --saveGraph${saveGraph} - --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset --outputPath${outputPath}/relation --datasourceOrganizationPath${workingDir}/preparedInfo/datasourceOrganization --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + --hive_metastore_uris${hive_metastore_uris} + --saveGraph${saveGraph} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset @@ -236,12 +232,12 @@ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --sourcePath${sourcePath}/otherresearchproduct - --hive_metastore_uris${hive_metastore_uris} - --saveGraph${saveGraph} - --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct --outputPath${outputPath}/relation --datasourceOrganizationPath${workingDir}/preparedInfo/datasourceOrganization --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + --hive_metastore_uris${hive_metastore_uris} + 
--saveGraph${saveGraph} + --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct @@ -266,12 +262,12 @@ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --sourcePath${sourcePath}/software - --hive_metastore_uris${hive_metastore_uris} - --saveGraph${saveGraph} - --resultTableNameeu.dnetlib.dhp.schema.oaf.Software --outputPath${outputPath}/relation --datasourceOrganizationPath${workingDir}/preparedInfo/datasourceOrganization --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + --hive_metastore_uris${hive_metastore_uris} + --saveGraph${saveGraph} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Software diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index be0b910226..b9c4e6c804 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -64,7 +64,7 @@ public abstract class AbstractMdRecordToOafMapper { } protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier( - "main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); + "main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); protected AbstractMdRecordToOafMapper(final Map code2name) { this.code2name = code2name; @@ -75,20 +75,20 @@ public abstract class AbstractMdRecordToOafMapper { DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); final Document doc = DocumentHelper - .parseText( - xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3)); + .parseText( + xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3)); final String type = doc.valueOf("//dr:CobjCategory/@type"); final KeyValue collectedFrom = getProvenanceDatasource( - doc, "//oaf:collectedFrom/@id", 
"//oaf:collectedFrom/@name"); + doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); if (collectedFrom == null) { return null; } final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) - ? collectedFrom - : getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name"); + ? collectedFrom + : getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name"); if (hostedBy == null) { return null; @@ -112,17 +112,17 @@ public abstract class AbstractMdRecordToOafMapper { } return keyValue( - createOpenaireId(10, dsId, true), - dsName); + createOpenaireId(10, dsId, true), + dsName); } protected List createOafs( - final Document doc, - final String type, - final KeyValue collectedFrom, - final KeyValue hostedBy, - final DataInfo info, - final long lastUpdateTimestamp) { + final Document doc, + final String type, + final KeyValue collectedFrom, + final KeyValue hostedBy, + final DataInfo info, + final long lastUpdateTimestamp) { final List oafs = new ArrayList<>(); @@ -179,10 +179,10 @@ public abstract class AbstractMdRecordToOafMapper { } private List addProjectRels( - final Document doc, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp) { + final Document doc, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp) { final List res = new ArrayList<>(); @@ -196,15 +196,15 @@ public abstract class AbstractMdRecordToOafMapper { final String projectId = createOpenaireId(40, originalId, true); res - .add( - getRelation( - docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, collectedFrom, info, - lastUpdateTimestamp)); + .add( + getRelation( + docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, collectedFrom, info, + lastUpdateTimestamp)); res - .add( - getRelation( - projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, collectedFrom, info, - lastUpdateTimestamp)); + .add( + getRelation( + projectId, docId, RESULT_PROJECT, OUTCOME, 
PRODUCES, collectedFrom, info, + lastUpdateTimestamp)); } } @@ -212,7 +212,7 @@ public abstract class AbstractMdRecordToOafMapper { } protected Relation getRelation(String source, String target, String relType, String subRelType, String relClass, - KeyValue collectedFrom, DataInfo info, long lastUpdateTimestamp) { + KeyValue collectedFrom, DataInfo info, long lastUpdateTimestamp) { final Relation rel = new Relation(); rel.setRelType(relType); rel.setSubRelType(subRelType); @@ -226,27 +226,27 @@ public abstract class AbstractMdRecordToOafMapper { } protected abstract List addOtherResultRels( - final Document doc, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp); + final Document doc, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp); private void populateResultFields( - final Result r, - final Document doc, - final KeyValue collectedFrom, - final KeyValue hostedBy, - final DataInfo info, - final long lastUpdateTimestamp) { + final Result r, + final Document doc, + final KeyValue collectedFrom, + final KeyValue hostedBy, + final DataInfo info, + final long lastUpdateTimestamp) { r.setDataInfo(info); r.setLastupdatetimestamp(lastUpdateTimestamp); r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false)); r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier"))); r.setCollectedfrom(Arrays.asList(collectedFrom)); r - .setPid( - prepareListStructProps( - doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); + .setPid( + prepareListStructProps( + doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); r.setDateofcollection(doc.valueOf("//dr:dateOfCollection")); r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation")); r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES @@ -289,7 +289,7 @@ public abstract class AbstractMdRecordToOafMapper { protected abstract Qualifier 
prepareResourceType(Document doc, DataInfo info); protected abstract List prepareInstances( - Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby); + Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby); protected abstract List> prepareSources(Document doc, DataInfo info); @@ -314,13 +314,13 @@ public abstract class AbstractMdRecordToOafMapper { protected abstract List prepareAuthors(Document doc, DataInfo info); protected abstract List> prepareOtherResearchProductTools( - Document doc, DataInfo info); + Document doc, DataInfo info); protected abstract List> prepareOtherResearchProductContactGroups( - Document doc, DataInfo info); + Document doc, DataInfo info); protected abstract List> prepareOtherResearchProductContactPersons( - Document doc, DataInfo info); + Document doc, DataInfo info); protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info); @@ -329,7 +329,7 @@ public abstract class AbstractMdRecordToOafMapper { protected abstract List prepareSoftwareLicenses(Document doc, DataInfo info); protected abstract List> prepareSoftwareDocumentationUrls( - Document doc, DataInfo info); + Document doc, DataInfo info); protected abstract List prepareDatasetGeoLocations(Document doc, DataInfo info); @@ -359,37 +359,37 @@ public abstract class AbstractMdRecordToOafMapper { final String edition = n.valueOf("@edition"); if (StringUtils.isNotBlank(name)) { return journal( - name, - issnPrinted, - issnOnline, - issnLinking, - ep, - iss, - sp, - vol, - edition, - null, - null, - info); + name, + issnPrinted, + issnOnline, + issnLinking, + ep, + iss, + sp, + vol, + edition, + null, + null, + info); } } return null; } protected Qualifier prepareQualifier( - final Node node, final String xpath, final String schemeId, final String schemeName) { + final Node node, final String xpath, final String schemeId, final String schemeName) { final String classId = node.valueOf(xpath); final String className = 
code2name.get(classId); return qualifier(classId, className, schemeId, schemeName); } protected List prepareListStructProps( - final Node node, - final String xpath, - final String xpathClassId, - final String schemeId, - final String schemeName, - final DataInfo info) { + final Node node, + final String xpath, + final String xpathClassId, + final String schemeId, + final String schemeName, + final DataInfo info) { final List res = new ArrayList<>(); for (final Object o : node.selectNodes(xpath)) { final Node n = (Node) o; @@ -401,7 +401,7 @@ public abstract class AbstractMdRecordToOafMapper { } protected List prepareListStructProps( - final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) { + final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) { final List res = new ArrayList<>(); for (final Object o : node.selectNodes(xpath)) { final Node n = (Node) o; @@ -411,19 +411,19 @@ public abstract class AbstractMdRecordToOafMapper { } protected List prepareListStructProps( - final Node node, final String xpath, final DataInfo info) { + final Node node, final String xpath, final DataInfo info) { final List res = new ArrayList<>(); for (final Object o : node.selectNodes(xpath)) { final Node n = (Node) o; res - .add( - structuredProperty( - n.getText(), - n.valueOf("@classid"), - n.valueOf("@classname"), - n.valueOf("@schemeid"), - n.valueOf("@schemename"), - info)); + .add( + structuredProperty( + n.getText(), + n.valueOf("@classid"), + n.valueOf("@classname"), + n.valueOf("@schemeid"), + n.valueOf("@schemename"), + info)); } return res; } @@ -450,7 +450,7 @@ public abstract class AbstractMdRecordToOafMapper { if (n == null) { return dataInfo( - false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9"); + false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9"); } final String paClassId = n.valueOf("./oaf:provenanceaction/@classid"); @@ -464,12 +464,12 @@ public abstract class 
AbstractMdRecordToOafMapper { final String trust = n.valueOf("./oaf:trust"); return dataInfo( - deletedbyinference, - inferenceprovenance, - inferred, - false, - qualifier(paClassId, paClassName, paSchemeId, paSchemeName), - trust); + deletedbyinference, + inferenceprovenance, + inferred, + false, + qualifier(paClassId, paClassName, paSchemeId, paSchemeName), + trust); } protected Field prepareField(final Node node, final String xpath, final DataInfo info) { @@ -477,7 +477,7 @@ public abstract class AbstractMdRecordToOafMapper { } protected List> prepareListFields( - final Node node, final String xpath, final DataInfo info) { + final Node node, final String xpath, final DataInfo info) { return listFields(info, prepareListString(node, xpath)); } From 8828458acfd38e47b03c1a88335b4b5bc69c9eab Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 14 May 2020 10:34:12 +0200 Subject: [PATCH 04/24] minor changes --- .../PrepareResultOrcidAssociationStep1.java | 33 ++++++++++--------- .../SparkOrcidToResultFromSemRelJob.java | 22 +++++++------ 2 files changed, 29 insertions(+), 26 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java index 3e16b4b4b3..7cd057cf39 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java @@ -97,22 +97,23 @@ public class PrepareResultOrcidAssociationStep1 { Dataset result = readPath(spark, inputResultPath, resultClazz); result.createOrReplaceTempView("result"); - String query = " select target resultId, author authorList" - + " from (select id, collect_set(named_struct('name', name, 'surname', surname, 'fullname', 
fullname, 'orcid', orcid)) author " - + " from ( " - + " select id, MyT.fullname, MyT.name, MyT.surname, MyP.value orcid " - + " from result " - + " lateral view explode (author) a as MyT " - + " lateral view explode (MyT.pid) p as MyP " - + " where MyP.qualifier.classid = 'ORCID') tmp " - + " group by id) r_t " - + " join (" - + " select source, target " - + " from relation " - + " where datainfo.deletedbyinference = false " - + getConstraintList(" relclass = '", allowedsemrel) - + ") rel_rel " - + " on source = id"; + String query = + "SELECT target resultId, author authorList" + + " FROM (SELECT id, collect_set(named_struct('name', name, 'surname', surname, 'fullname', fullname, 'orcid', orcid)) author " + + " FROM ( " + + " SELECT DISTINCT id, MyT.fullname, MyT.name, MyT.surname, MyP.value orcid " + + " FROM result " + + " LATERAL VIEW EXPLODE (author) a AS MyT " + + " LATERAL VIEW EXPLODE (MyT.pid) p AS MyP " + + " WHERE MyP.qualifier.classid = 'ORCID') tmp " + + " GROUP BY id) r_t " + + " JOIN (" + + " SELECT source, target " + + " FROM relation " + + " WHERE datainfo.deletedbyinference = false " + + getConstraintList(" relclass = '", allowedsemrel) + + " ) rel_rel " + + " ON source = id"; spark .sql(query) .as(Encoders.bean(ResultOrcidList.class)) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java index ebb75a5a67..b93b66d9f9 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java @@ -132,16 +132,16 @@ public class SparkOrcidToResultFromSemRelJob { private static boolean enrichAuthor(AutoritativeAuthor autoritative_author, Author author) { boolean toaddpid = 
false; - if (StringUtils.isNoneEmpty(autoritative_author.getSurname())) { - if (StringUtils.isNoneEmpty(author.getSurname())) { + if (StringUtils.isNotEmpty(autoritative_author.getSurname())) { + if (StringUtils.isNotEmpty(author.getSurname())) { if (autoritative_author .getSurname() .trim() .equalsIgnoreCase(author.getSurname().trim())) { // have the same surname. Check the name - if (StringUtils.isNoneEmpty(autoritative_author.getName())) { - if (StringUtils.isNoneEmpty(author.getName())) { + if (StringUtils.isNotEmpty(autoritative_author.getName())) { + if (StringUtils.isNotEmpty(author.getName())) { if (autoritative_author .getName() .trim() @@ -150,12 +150,14 @@ public class SparkOrcidToResultFromSemRelJob { } // they could be differently written (i.e. only the initials of the name // in one of the two - if (autoritative_author - .getName() - .trim() - .substring(0, 0) - .equalsIgnoreCase(author.getName().trim().substring(0, 0))) { - toaddpid = true; + else { + if (autoritative_author + .getName() + .trim() + .substring(0, 0) + .equalsIgnoreCase(author.getName().trim().substring(0, 0))) { + toaddpid = true; + } } } } From f044d093156c3c29cf00a4a9b498459885ebcdd0 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 14 May 2020 15:07:24 +0200 Subject: [PATCH 05/24] revised mapping: more accurate mapping for name/surname from datacite format; improved mapping of null values --- .../migration/ProtoConverter.java | 27 ++++++++++++----- .../raw/MigrateDbEntitiesApplication.java | 20 +++++-------- .../dhp/oa/graph/raw/OdfToOafMapper.java | 30 +++++++++++++++---- .../dhp/oa/graph/raw/common/PacePerson.java | 1 - 4 files changed, 51 insertions(+), 27 deletions(-) diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/ProtoConverter.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/ProtoConverter.java index 90d573ac07..e55c0eb7b6 100644 --- 
a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/ProtoConverter.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/ProtoConverter.java @@ -523,7 +523,9 @@ public class ProtoConverter implements Serializable { } private static Context mapContext(ResultProtos.Result.Context context) { - + if (context == null || StringUtils.isBlank(context.getId())) { + return null; + } final Context entity = new Context(); entity.setId(context.getId()); entity @@ -537,6 +539,10 @@ public class ProtoConverter implements Serializable { } public static KeyValue mapKV(FieldTypeProtos.KeyValue kv) { + if (kv == null || StringUtils.isBlank(kv.getKey()) & StringUtils.isBlank(kv.getValue())) { + return null; + } + final KeyValue keyValue = new KeyValue(); keyValue.setKey(kv.getKey()); keyValue.setValue(kv.getValue()); @@ -575,6 +581,10 @@ public class ProtoConverter implements Serializable { } public static StructuredProperty mapStructuredProperty(FieldTypeProtos.StructuredProperty sp) { + if (sp == null | StringUtils.isBlank(sp.getValue())) { + return null; + } + final StructuredProperty structuredProperty = new StructuredProperty(); structuredProperty.setValue(sp.getValue()); structuredProperty.setQualifier(mapQualifier(sp.getQualifier())); @@ -611,6 +621,10 @@ public class ProtoConverter implements Serializable { } public static Field mapStringField(FieldTypeProtos.StringField s) { + if (s == null || StringUtils.isBlank(s.getValue())) { + return null; + } + final Field stringField = new Field<>(); stringField.setValue(s.getValue()); stringField.setDataInfo(mapDataInfo(s.getDataInfo())); @@ -618,19 +632,16 @@ public class ProtoConverter implements Serializable { } public static Field mapBoolField(FieldTypeProtos.BoolField b) { + if (b == null) { + return null; + } + final Field booleanField = new Field<>(); booleanField.setValue(b.getValue()); booleanField.setDataInfo(mapDataInfo(b.getDataInfo())); return 
booleanField; } - public static Field mapIntField(FieldTypeProtos.IntField b) { - final Field entity = new Field<>(); - entity.setValue(b.getValue()); - entity.setDataInfo(mapDataInfo(b.getDataInfo())); - return entity; - } - public static Journal mapJournal(FieldTypeProtos.Journal j) { final Journal journal = new Journal(); journal.setConferencedate(j.getConferencedate()); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index e5e348642c..ebe2b703ba 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -50,8 +50,7 @@ import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -public class MigrateDbEntitiesApplication extends AbstractMigrationApplication - implements Closeable { +public class MigrateDbEntitiesApplication extends AbstractMigrationApplication implements Closeable { private static final Log log = LogFactory.getLog(MigrateDbEntitiesApplication.class); @@ -128,9 +127,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication } public List processDatasource(final ResultSet rs) { - try { - final DataInfo info = prepareDataInfo(rs); final Datasource ds = new Datasource(); @@ -194,7 +191,6 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication public List processProject(final ResultSet rs) { try { - final DataInfo info = prepareDataInfo(rs); final Project p = new Project(); @@ -249,9 +245,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication } public List processOrganization(final ResultSet rs) { - try { - final DataInfo info = 
prepareDataInfo(rs); final Organization o = new Organization(); @@ -370,14 +364,12 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication final DataInfo info = dataInfo( false, null, false, false, - qualifier(USER_CLAIM, USER_CLAIM, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), "0.9"); final List collectedFrom = listKeyValues( createOpenaireId(10, "infrastruct_::openaire", true), "OpenAIRE"); try { - if (rs.getString(SOURCE_TYPE).equals("context")) { final Result r; @@ -461,9 +453,12 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication final Boolean inferred = rs.getBoolean("inferred"); final String trust = rs.getString("trust"); return dataInfo( - - deletedbyinference, inferenceprovenance, inferred, false, ENTITYREGISTRY_PROVENANCE_ACTION, trust); - + deletedbyinference, + inferenceprovenance, + inferred, + false, + ENTITYREGISTRY_PROVENANCE_ACTION, + trust); } private Qualifier prepareQualifierSplitting(final String s) { @@ -535,4 +530,5 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication super.close(); dbClient.close(); } + } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 04984d0086..5baac12fd0 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -12,6 +12,7 @@ import org.apache.commons.lang3.StringUtils; import org.dom4j.Document; import org.dom4j.Node; +import eu.dnetlib.dhp.oa.graph.raw.common.PacePerson; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.DataInfo; @@ -44,9 +45,24 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { for (final Object o : 
doc.selectNodes("//datacite:creator")) { final Node n = (Node) o; final Author author = new Author(); - author.setFullname(n.valueOf("./datacite:creatorName")); - author.setName(n.valueOf("./datacite:givenName")); - author.setSurname(n.valueOf("./datacite:familyName")); + final String fullname = n.valueOf("./datacite:creatorName"); + author.setFullname(fullname); + + PacePerson pp = new PacePerson(fullname, false); + final String name = n.valueOf("./datacite:givenName"); + if (StringUtils.isBlank(name) & pp.isAccurate()) { + author.setName(pp.getNormalisedFirstName()); + } else { + author.setName(name); + } + + final String surname = n.valueOf("./datacite:familyName"); + if (StringUtils.isBlank(surname) & pp.isAccurate()) { + author.setSurname(pp.getNormalisedSurname()); + } else { + author.setSurname(surname); + } + author.setAffiliation(prepareListFields(doc, "./datacite:affiliation", info)); author.setPid(preparePids(doc, info)); author.setRank(pos++); @@ -77,8 +93,6 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { final KeyValue hostedby) { final Instance instance = new Instance(); - final Set url = new HashSet<>(); - instance.setUrl(new ArrayList<>()); instance .setInstancetype( prepareQualifier( @@ -97,6 +111,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { .setProcessingchargecurrency( field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); + final Set url = new HashSet<>(); for (final Object o : doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='URL']")) { url.add(((Node) o).getText().trim()); } @@ -109,7 +124,10 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { for (final Object o : doc.selectNodes("//datacite:identifier[@identifierType='DOI']")) { url.add(HTTP_DX_DOI_PREIFX + ((Node) o).getText().trim()); } - instance.getUrl().addAll(url); + if (!url.isEmpty()) { + instance.setUrl(new ArrayList<>()); + instance.getUrl().addAll(url); + } return 
Arrays.asList(instance); } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/PacePerson.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/PacePerson.java index d1c615dcdd..6e474f2f38 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/PacePerson.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/PacePerson.java @@ -1,7 +1,6 @@ package eu.dnetlib.dhp.oa.graph.raw.common; -import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.text.Normalizer; import java.util.HashSet; From 42085e8d99220ccdf1f4c9cc38d26db24b9544a2 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 14 May 2020 18:22:28 +0200 Subject: [PATCH 06/24] added some constants --- .../java/eu/dnetlib/dhp/schema/common/ModelConstants.java | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java index accc06d122..e32dd10fa9 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java @@ -13,6 +13,7 @@ public class ModelConstants { public static final String DNET_DATA_CITE_DATE = "dnet:dataCite_date"; public static final String DNET_DATA_CITE_RESOURCE = "dnet:dataCite_resource"; public static final String DNET_PROVENANCE_ACTIONS = "dnet:provenanceActions"; + public static final String DNET_COUNTRY_TYPE = "dnet:countries"; public static final String SYSIMPORT_CROSSWALK_REPOSITORY = "sysimport:crosswalk:repository"; public static final String SYSIMPORT_CROSSWALK_ENTITYREGISTRY = "sysimport:crosswalk:entityregistry"; @@ -49,6 +50,13 @@ public class ModelConstants { public static final String HAS_PARTICIPANT = "hasParticipant"; public static final String 
IS_PARTICIPANT = "isParticipant"; + public static final String RESULT_ORGANIZATION = "resultOrganization"; + public static final String AFFILIATION = "affiliation"; + public static final String IS_AUTHOR_INSTITUTION_OF = "isAuthorInstitutionOf"; + public static final String HAS_AUTHOR_INSTITUTION = "hasAuthorInstitution"; + + public static final String MERGES = "merges"; + public static final String UNKNOWN = "UNKNOWN"; public static final String NOT_AVAILABLE = "not available"; From d05630d9795f1a26a9ea9ce33d2337c9156217ff Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 14 May 2020 18:22:50 +0200 Subject: [PATCH 07/24] removed the constants added in ModelConstants --- .../eu/dnetlib/dhp/PropagationConstant.java | 30 ++++--------------- 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java index 8d2fede82e..13ed46508a 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java @@ -4,6 +4,8 @@ package eu.dnetlib.dhp; import java.util.List; import java.util.Optional; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.common.ModelSupport; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -24,10 +26,6 @@ public class PropagationConstant { public static final String TRUE = "true"; - public static final String DNET_COUNTRY_SCHEMA = "dnet:countries"; - public static final String DNET_SCHEMA_NAME = "dnet:provenanceActions"; - public static final String DNET_SCHEMA_ID = "dnet:provenanceActions"; - public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_ID = "country:instrepos"; public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_NAME = 
"Propagation of country to result collected from datasources of type institutional repositories"; @@ -46,22 +44,6 @@ public class PropagationConstant { public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_ID = "authorpid:result"; public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_NAME = "Propagation of authors pid to result through semantic relations"; - public static final String RELATION_DATASOURCE_ORGANIZATION_REL_CLASS = "isProvidedBy"; - - public static final String RELATION_RESULTORGANIZATION_REL_TYPE = "resultOrganization"; - public static final String RELATION_RESULTORGANIZATION_SUBREL_TYPE = "affiliation"; - public static final String RELATION_ORGANIZATION_RESULT_REL_CLASS = "isAuthorInstitutionOf"; - public static final String RELATION_RESULT_ORGANIZATION_REL_CLASS = "hasAuthorInstitution"; - - public static final String RELATION_RESULTRESULT_REL_TYPE = "resultResult"; - - public static final String RELATION_RESULTPROJECT_REL_TYPE = "resultProject"; - public static final String RELATION_RESULTPROJECT_SUBREL_TYPE = "outcome"; - public static final String RELATION_RESULT_PROJECT_REL_CLASS = "isProducedBy"; - public static final String RELATION_PROJECT_RESULT_REL_CLASS = "produces"; - - public static final String RELATION_REPRESENTATIVERESULT_RESULT_CLASS = "merges"; - public static final String PROPAGATION_AUTHOR_PID = "ORCID"; public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -76,8 +58,8 @@ public class PropagationConstant { Country nc = new Country(); nc.setClassid(classid); nc.setClassname(classname); - nc.setSchemename(DNET_COUNTRY_SCHEMA); - nc.setSchemeid(DNET_COUNTRY_SCHEMA); + nc.setSchemename(ModelConstants.DNET_COUNTRY_TYPE); + nc.setSchemeid(ModelConstants.DNET_COUNTRY_TYPE); nc .setDataInfo( getDataInfo( @@ -102,8 +84,8 @@ public class PropagationConstant { Qualifier pa = new Qualifier(); pa.setClassid(inference_class_id); pa.setClassname(inference_class_name); - 
pa.setSchemeid(DNET_SCHEMA_ID); - pa.setSchemename(DNET_SCHEMA_NAME); + pa.setSchemeid(ModelConstants.DNET_PID_TYPES); + pa.setSchemename(ModelConstants.DNET_PID_TYPES); return pa; } From f25db01664eb56d2250d00e95822b4aaacaf52bf Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 14 May 2020 18:29:24 +0200 Subject: [PATCH 08/24] changed in the constant from propagationconstants to modelconstants --- .../PrepareDatasourceCountryAssociation.java | 3 ++- .../PrepareProjectResultsAssociation.java | 3 ++- .../SparkResultToProjectThroughSemRelJob.java | 13 +++++++------ .../PrepareResultCommunitySet.java | 5 +++-- .../PrepareResultInstRepoAssociation.java | 5 +++-- .../SparkResultToOrganizationFromIstRepoJob.java | 13 +++++++------ 6 files changed, 24 insertions(+), 18 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java index e91a1e48a7..1d01d35e5f 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java @@ -7,6 +7,7 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import java.util.Arrays; import java.util.List; +import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; @@ -100,7 +101,7 @@ public class PrepareDatasourceCountryAssociation { + "JOIN ( SELECT source, target " + " FROM relation " + " WHERE relclass = '" - + RELATION_DATASOURCE_ORGANIZATION_REL_CLASS + + ModelConstants.IS_PROVIDED_BY + "' " + " AND datainfo.deletedbyinference = false ) rel " + "ON d.id = rel.source " diff --git 
a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java index c27da42583..920ef1f06d 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java @@ -8,6 +8,7 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import java.util.Arrays; import java.util.List; +import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; @@ -85,7 +86,7 @@ public class PrepareProjectResultsAssociation { + " FROM relation " + " WHERE datainfo.deletedbyinference = false " + " AND relClass = '" - + RELATION_RESULT_PROJECT_REL_CLASS + + ModelConstants.IS_PRODUCED_BY + "'"; Dataset resproj_relation = spark.sql(resproj_relation_query); diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java index 36694b3dd5..44a439ab78 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java @@ -9,6 +9,7 @@ import java.util.Iterator; import java.util.List; import java.util.Optional; +import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.FlatMapFunction; @@ -122,9 +123,9 @@ public class SparkResultToProjectThroughSemRelJob { 
getRelation( resId, projectId, - RELATION_RESULT_PROJECT_REL_CLASS, - RELATION_RESULTPROJECT_REL_TYPE, - RELATION_RESULTPROJECT_SUBREL_TYPE, + ModelConstants.IS_PRODUCED_BY, + ModelConstants.RESULT_PROJECT , + ModelConstants.OUTCOME, PROPAGATION_DATA_INFO_TYPE, PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID, PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME)); @@ -133,9 +134,9 @@ public class SparkResultToProjectThroughSemRelJob { getRelation( projectId, resId, - RELATION_PROJECT_RESULT_REL_CLASS, - RELATION_RESULTPROJECT_REL_TYPE, - RELATION_RESULTPROJECT_SUBREL_TYPE, + ModelConstants.PRODUCES, + ModelConstants.RESULT_PROJECT, + ModelConstants.OUTCOME, PROPAGATION_DATA_INFO_TYPE, PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID, PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME)); diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java index 90eb54e5fa..fc9f395d1b 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java @@ -6,6 +6,7 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import java.util.*; +import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -74,13 +75,13 @@ public class PrepareResultCommunitySet { + " FROM relation " + " WHERE datainfo.deletedbyinference = false " + " AND relClass = '" - + RELATION_RESULT_ORGANIZATION_REL_CLASS + + ModelConstants.HAS_AUTHOR_INSTITUTION + "') result_organization " + "LEFT JOIN (SELECT source, collect_set(target) org_set " + " FROM 
relation " + " WHERE datainfo.deletedbyinference = false " + " AND relClass = '" - + RELATION_REPRESENTATIVERESULT_RESULT_CLASS + + ModelConstants.MERGES + "' " + " GROUP BY source) organization_organization " + "ON result_organization.target = organization_organization.source "; diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java index 5f549be531..0a83e4195c 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java @@ -4,6 +4,7 @@ package eu.dnetlib.dhp.resulttoorganizationfrominstrepo; import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; +import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; @@ -91,7 +92,7 @@ public class PrepareResultInstRepoAssociation { + "JOIN ( SELECT source, target " + "FROM relation " + "WHERE relclass = '" - + RELATION_DATASOURCE_ORGANIZATION_REL_CLASS + + ModelConstants.IS_PROVIDED_BY + "' " + "AND datainfo.deletedbyinference = false ) rel " + "ON d.id = rel.source "; @@ -111,7 +112,7 @@ public class PrepareResultInstRepoAssociation { + "from relation " + "where datainfo.deletedbyinference = false " + "and relClass = '" - + RELATION_RESULT_ORGANIZATION_REL_CLASS + + ModelConstants.HAS_AUTHOR_INSTITUTION + "' " + "group by source"; diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java 
b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java index 13577fa7c1..fc9a46d725 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java @@ -6,6 +6,7 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import java.util.*; +import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; @@ -150,9 +151,9 @@ public class SparkResultToOrganizationFromIstRepoJob { getRelation( orgId, resultId, - RELATION_ORGANIZATION_RESULT_REL_CLASS, - RELATION_RESULTORGANIZATION_REL_TYPE, - RELATION_RESULTORGANIZATION_SUBREL_TYPE, + ModelConstants.IS_AUTHOR_INSTITUTION_OF, + ModelConstants.RESULT_ORGANIZATION, + ModelConstants.AFFILIATION, PROPAGATION_DATA_INFO_TYPE, PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID, PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME)); @@ -161,9 +162,9 @@ public class SparkResultToOrganizationFromIstRepoJob { getRelation( resultId, orgId, - RELATION_RESULT_ORGANIZATION_REL_CLASS, - RELATION_RESULTORGANIZATION_REL_TYPE, - RELATION_RESULTORGANIZATION_SUBREL_TYPE, + ModelConstants.HAS_AUTHOR_INSTITUTION, + ModelConstants.RESULT_ORGANIZATION, + ModelConstants.AFFILIATION, PROPAGATION_DATA_INFO_TYPE, PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID, PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME)); From eb64335a547c1a8b9e708f50b6f33b362fa1e54e Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 15 May 2020 09:05:26 +0200 Subject: [PATCH 09/24] parallel implementation for graph Hive importer --- .../graph/hive/GraphHiveTableImporterJob.java | 79 
+++++++ .../hive/oozie_app/lib/scripts/reset_db.sql | 2 + .../dhp/oa/graph/hive/oozie_app/workflow.xml | 200 +++++++++++++++++- .../oa/graph/hive_db_importer_parameters.json | 26 +++ .../graph/hive_table_importer_parameters.json | 32 +++ 5 files changed, 331 insertions(+), 8 deletions(-) create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/reset_db.sql create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive_db_importer_parameters.json create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive_table_importer_parameters.json diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java new file mode 100644 index 0000000000..f88f7457f0 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java @@ -0,0 +1,79 @@ + +package eu.dnetlib.dhp.oa.graph.hive; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; +import static eu.dnetlib.dhp.schema.common.ModelSupport.tableIdentifier; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Oaf; + +public class GraphHiveTableImporterJob { + + private static final Logger log = 
LoggerFactory.getLogger(GraphHiveTableImporterJob.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(String[] args) throws Exception { + + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + GraphHiveTableImporterJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/hive_table_importer_parameters.json"))); + parser.parseArgument(args); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String inputPath = parser.get("inputPath"); + log.info("inputPath: {}", inputPath); + + String hiveDbName = parser.get("hiveDbName"); + log.info("hiveDbName: {}", hiveDbName); + + final String className = parser.get("className"); + log.info("className: {}", className); + + Class clazz = (Class) Class.forName(className); + + String hiveMetastoreUris = parser.get("hiveMetastoreUris"); + log.info("hiveMetastoreUris: {}", hiveMetastoreUris); + + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", hiveMetastoreUris); + + runWithSparkHiveSession( + conf, isSparkSessionManaged, spark -> loadGraphTable(spark, inputPath, hiveDbName, clazz)); + } + + // NOTE: declared private, not protected; widen the visibility if tests need to call it + private static void loadGraphTable(SparkSession spark, String inputPath, String hiveDbName, + Class clazz) { + + spark + .read() + .textFile(inputPath) + .map((MapFunction) s -> OBJECT_MAPPER.readValue(s, clazz), Encoders.bean(clazz)) + .write() + .mode(SaveMode.Overwrite) + .saveAsTable(tableIdentifier(hiveDbName, clazz)); + } + +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/reset_db.sql b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/reset_db.sql new file mode 100644 index 0000000000..484afde804 --- /dev/null
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/reset_db.sql @@ -0,0 +1,2 @@ +DROP DATABASE IF EXISTS ${hiveDbName} CASCADE; +CREATE DATABASE ${hiveDbName}; \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml index e837ac6b31..2bcbbba5eb 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml @@ -72,18 +72,44 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + + + + + hive.metastore.uris + ${hiveMetastoreUris} + + + ${hiveJdbcUrl}/${hiveDbName} + + hiveDbName=${hiveDbName} + + + + + + + + + + + + + + + + yarn cluster - MapGraphAsHiveDB - eu.dnetlib.dhp.oa.graph.hive.GraphHiveImporterJob + Import table publication + eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob dhp-graph-mapper-${projectVersion}.jar --executor-memory=${sparkExecutorMemory} @@ -95,18 +121,175 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - --inputPath${inputPath} + --inputPath${inputPath}/publication --hiveDbName${hiveDbName} + --classNameeu.dnetlib.dhp.schema.oaf.Publication --hiveMetastoreUris${hiveMetastoreUris} - + + + + yarn + cluster + Import table dataset + eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf 
spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${inputPath}/dataset + --hiveDbName${hiveDbName} + --classNameeu.dnetlib.dhp.schema.oaf.Dataset + --hiveMetastoreUris${hiveMetastoreUris} + + + + + + + + yarn + cluster + Import table otherresearchproduct + eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${inputPath}/otherresearchproduct + --hiveDbName${hiveDbName} + --classNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --hiveMetastoreUris${hiveMetastoreUris} + + + + + + + + yarn + cluster + Import table software + eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${inputPath}/software + --hiveDbName${hiveDbName} + --classNameeu.dnetlib.dhp.schema.oaf.Software + --hiveMetastoreUris${hiveMetastoreUris} + + + + + + + + yarn + cluster + Import table datasource + 
eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${inputPath}/datasource + --hiveDbName${hiveDbName} + --classNameeu.dnetlib.dhp.schema.oaf.Datasource + --hiveMetastoreUris${hiveMetastoreUris} + + + + + + + + yarn + cluster + Import table organization + eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${inputPath}/organization + --hiveDbName${hiveDbName} + --classNameeu.dnetlib.dhp.schema.oaf.Organization + --hiveMetastoreUris${hiveMetastoreUris} + + + + + + + + yarn + cluster + Import table project + eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf 
spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${inputPath}/project + --hiveDbName${hiveDbName} + --classNameeu.dnetlib.dhp.schema.oaf.Project + --hiveMetastoreUris${hiveMetastoreUris} + + + + + + + - ${jobTracker} - ${nameNode} hive.metastore.uris @@ -122,4 +305,5 @@ + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive_db_importer_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive_db_importer_parameters.json new file mode 100644 index 0000000000..d6c13773ae --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive_db_importer_parameters.json @@ -0,0 +1,26 @@ +[ + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "in", + "paramLongName": "inputPath", + "paramDescription": "the path to the graph data dump to read", + "paramRequired": true + }, + { + "paramName": "hmu", + "paramLongName": "hiveMetastoreUris", + "paramDescription": "the hive metastore uris", + "paramRequired": true + }, + { + "paramName": "db", + "paramLongName": "hiveDbName", + "paramDescription": "the target hive database name", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive_table_importer_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive_table_importer_parameters.json new file mode 100644 index 0000000000..5b5b0743c7 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive_table_importer_parameters.json @@ -0,0 +1,32 @@ +[ + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop 
SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "in", + "paramLongName": "inputPath", + "paramDescription": "the path to the graph data dump to read", + "paramRequired": true + }, + { + "paramName": "hmu", + "paramLongName": "hiveMetastoreUris", + "paramDescription": "the hive metastore uris", + "paramRequired": true + }, + { + "paramName": "db", + "paramLongName": "hiveDbName", + "paramDescription": "the target hive database name", + "paramRequired": true + }, + { + "paramName": "tn", + "paramLongName": "className", + "paramDescription": "the class modelling the target table", + "paramRequired": true + } +] \ No newline at end of file From fd62359538d397cfd9e73a3602e38638a5a0ff1d Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 15 May 2020 09:28:15 +0200 Subject: [PATCH 10/24] cleanup --- .../graph/migrate_actionsets_parameters.json | 10 ---------- .../eu/dnetlib/dhp/oa/graph/scriptFile.scala | 0 .../transform_actionsets_parameters.json | 20 ------------------- 3 files changed, 30 deletions(-) delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/migrate_actionsets_parameters.json create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/scriptFile.scala delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/transform_actionsets_parameters.json diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/migrate_actionsets_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/migrate_actionsets_parameters.json deleted file mode 100644 index c4910ec61b..0000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/migrate_actionsets_parameters.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - {"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true}, - 
{"paramName":"sn", "paramLongName":"sourceNameNode", "paramDescription": "nameNode of the source cluster", "paramRequired": true}, - {"paramName":"tn", "paramLongName":"targetNameNode", "paramDescription": "namoNode of the target cluster", "paramRequired": true}, - {"paramName":"w", "paramLongName":"workingDirectory", "paramDescription": "working directory", "paramRequired": true}, - {"paramName":"nm", "paramLongName":"distcp_num_maps", "paramDescription": "maximum number of map tasks used in the distcp process", "paramRequired": true}, - {"paramName":"mm", "paramLongName":"distcp_memory_mb", "paramDescription": "memory for distcp action copying actionsets from remote cluster", "paramRequired": true}, - {"paramName":"tt", "paramLongName":"distcp_task_timeout", "paramDescription": "timeout for distcp copying actions from remote cluster", "paramRequired": true}, - {"paramName":"tr", "paramLongName":"transform_only", "paramDescription": "activate tranform-only mode. Only apply transformation step", "paramRequired": true} -] diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/scriptFile.scala b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/scriptFile.scala new file mode 100644 index 0000000000..e69de29bb2 diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/transform_actionsets_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/transform_actionsets_parameters.json deleted file mode 100644 index 6fa10f7399..0000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/transform_actionsets_parameters.json +++ /dev/null @@ -1,20 +0,0 @@ -[ - { - "paramName": "mt", - "paramLongName": "master", - "paramDescription": "should be local or yarn", - "paramRequired": true - }, - { - "paramName": "is", - "paramLongName": "isLookupUrl", - "paramDescription": "URL of the isLookUp Service", - "paramRequired": true - }, 
- { - "paramName": "i", - "paramLongName": "inputPaths", - "paramDescription": "URL of the isLookUp Service", - "paramRequired": true - } -] From 9d028ffe1c91c270482c75d5860e9f0222c01341 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 15 May 2020 09:28:55 +0200 Subject: [PATCH 11/24] cleanup --- .../src/main/resources/eu/dnetlib/dhp/oa/graph/scriptFile.scala | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/scriptFile.scala diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/scriptFile.scala b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/scriptFile.scala deleted file mode 100644 index e69de29bb2..0000000000 From 18f46e47b915a3770e584765bce4e4d281f3668f Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 15 May 2020 09:34:48 +0200 Subject: [PATCH 12/24] added relations to the graph2hive import workflow --- .../dhp/oa/graph/hive/oozie_app/workflow.xml | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml index 2bcbbba5eb..8566d76670 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml @@ -102,6 +102,7 @@ + @@ -286,6 +287,32 @@ + + + yarn + cluster + Import table relation + eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf
spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${inputPath}/relation + --hiveDbName${hiveDbName} + --classNameeu.dnetlib.dhp.schema.oaf.Relation + --hiveMetastoreUris${hiveMetastoreUris} + + + + + From 50d6a2ad3c2f6fe19a52ff279749bd5633917fd6 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 15 May 2020 09:53:37 +0200 Subject: [PATCH 13/24] added output directory removal in the blacklist spark actions; included common global properties in blacklist's workflow.xml --- .../blacklist/PrepareMergedRelationJob.java | 7 ++++ .../SparkRemoveBlacklistedRelationJob.java | 7 +++- .../dhp/blacklist/oozie_app/workflow.xml | 37 +++++++++++-------- 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java index 0ef59e8c2b..2a46043e20 100644 --- a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java +++ b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java @@ -5,6 +5,7 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import java.util.Optional; +import eu.dnetlib.dhp.common.HdfsSupport; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -56,6 +57,7 @@ public class PrepareMergedRelationJob { conf, isSparkSessionManaged, spark -> { + removeOutputDir(spark, outputPath); selectMergesRelations( spark, inputPath, @@ -84,4 +86,9 @@ public class PrepareMergedRelationJob { (MapFunction) value -> OBJECT_MAPPER.readValue(value, Relation.class), 
Encoders.bean(Relation.class)); } + + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } + } diff --git a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java index 86587bfc90..fe4310217c 100644 --- a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java +++ b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java @@ -6,6 +6,7 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.util.Objects; import java.util.Optional; +import eu.dnetlib.dhp.common.HdfsSupport; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -62,6 +63,7 @@ public class SparkRemoveBlacklistedRelationJob { conf, isSparkSessionManaged, spark -> { + removeOutputDir(spark, outputPath); removeBlacklistedRelations( spark, blacklistPath, @@ -69,7 +71,6 @@ public class SparkRemoveBlacklistedRelationJob { outputPath, mergesPath); }); - } private static void removeBlacklistedRelations(SparkSession spark, String blacklistPath, String inputPath, @@ -144,4 +145,8 @@ public class SparkRemoveBlacklistedRelationJob { Encoders.bean(Relation.class)); } + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } + } diff --git a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml index 1538318c14..dd7827da4e 100644 --- a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml +++ 
b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml @@ -22,6 +22,25 @@ + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + @@ -49,8 +68,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/publication ${nameNode}/${outputPath}/publication @@ -60,8 +77,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/dataset ${nameNode}/${outputPath}/dataset @@ -71,8 +86,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/otherresearchproduct ${nameNode}/${outputPath}/otherresearchproduct @@ -82,8 +95,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/software ${nameNode}/${outputPath}/software @@ -93,8 +104,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/organization ${nameNode}/${outputPath}/organization @@ -104,8 +113,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/project ${nameNode}/${outputPath}/project @@ -115,8 +122,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/datasource ${nameNode}/${outputPath}/datasource @@ -128,8 +133,6 @@ - ${jobTracker} - ${nameNode} eu.dnetlib.dhp.blacklist.ReadBlacklistFromDB --hdfsPath${workingDir}/blacklist --hdfsNameNode${nameNode} @@ -156,6 +159,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 --sourcePath${sourcePath}/relation --outputPath${workingDir}/mergesRelation @@ -180,6 +184,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 
--sourcePath${sourcePath}/relation --outputPath${outputPath}/relation From b7e198475a7fdfacf5cdc3ea39d35f58af657d0c Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 15 May 2020 10:20:07 +0200 Subject: [PATCH 14/24] added common methods to create HiveDB table identifiers --- .../dhp/schema/common/ModelSupport.java | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java index fc85b1ac14..9ee7c2debc 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java @@ -1,10 +1,15 @@ package eu.dnetlib.dhp.schema.common; +import static com.google.common.base.Preconditions.checkArgument; + import java.util.Map; +import java.util.Objects; import java.util.Optional; import java.util.function.Function; +import org.apache.commons.lang3.StringUtils; + import com.google.common.collect.Maps; import eu.dnetlib.dhp.schema.oaf.*; @@ -379,6 +384,21 @@ public class ModelSupport { entityMapping.get(EntityType.valueOf(targetType)).name()); } + public static String tableIdentifier(String dbName, String tableName) { + + checkArgument(StringUtils.isNotBlank(dbName), "DB name cannot be empty"); + checkArgument(StringUtils.isNotBlank(tableName), "table name cannot be empty"); + + return String.format("%s.%s", dbName, tableName); + } + + public static String tableIdentifier(String dbName, Class clazz) { + + checkArgument(Objects.nonNull(clazz), "clazz is needed to derive the table name, thus cannot be null"); + + return tableIdentifier(dbName, clazz.getSimpleName().toLowerCase()); + } + public static Function idFn() { return x -> { if (isSubClass(x, Relation.class)) { From a83265829608c6a318e0a35cbdc7abc95dd9d1b6 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 15 May 2020 10:21:09 +0200 Subject: [PATCH 15/24] code 
formatting --- .../dhp/blacklist/PrepareMergedRelationJob.java | 2 +- .../blacklist/SparkRemoveBlacklistedRelationJob.java | 2 +- .../java/eu/dnetlib/dhp/PropagationConstant.java | 4 ++-- .../PrepareDatasourceCountryAssociation.java | 2 +- .../PrepareResultOrcidAssociationStep1.java | 5 ++--- .../SparkOrcidToResultFromSemRelJob.java | 8 ++++---- .../PrepareProjectResultsAssociation.java | 2 +- .../SparkResultToProjectThroughSemRelJob.java | 6 +++--- .../PrepareResultCommunitySet.java | 2 +- .../PrepareResultInstRepoAssociation.java | 2 +- .../SparkResultToOrganizationFromIstRepoJob.java | 8 ++++---- .../oa/graph/raw/MigrateDbEntitiesApplication.java | 12 ++++++------ 12 files changed, 27 insertions(+), 28 deletions(-) diff --git a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java index 2a46043e20..b4bcc509ee 100644 --- a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java +++ b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java @@ -5,7 +5,6 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import java.util.Optional; -import eu.dnetlib.dhp.common.HdfsSupport; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -19,6 +18,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.oaf.Relation; public class PrepareMergedRelationJob { diff --git a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java index 
fe4310217c..92289ec2d3 100644 --- a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java +++ b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java @@ -6,7 +6,6 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.util.Objects; import java.util.Optional; -import eu.dnetlib.dhp.common.HdfsSupport; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -19,6 +18,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.oaf.Relation; import scala.Tuple2; diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java index 13ed46508a..c8eb017c71 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java @@ -4,8 +4,6 @@ package eu.dnetlib.dhp; import java.util.List; import java.util.Optional; -import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.common.ModelSupport; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -17,6 +15,8 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; public class PropagationConstant { diff --git 
a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java index 1d01d35e5f..98b573102a 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java @@ -7,7 +7,6 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import java.util.Arrays; import java.util.List; -import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; @@ -22,6 +21,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; /** diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java index 6549d1ed29..b15f813acd 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java @@ -95,8 +95,7 @@ public class PrepareResultOrcidAssociationStep1 { Dataset result = readPath(spark, inputResultPath, resultClazz); result.createOrReplaceTempView("result"); - String query = - "SELECT target resultId, author authorList" + String query = "SELECT target resultId, author authorList" + " FROM (SELECT id, 
collect_set(named_struct('name', name, 'surname', surname, 'fullname', fullname, 'orcid', orcid)) author " + " FROM ( " + " SELECT DISTINCT id, MyT.fullname, MyT.name, MyT.surname, MyP.value orcid " @@ -109,7 +108,7 @@ public class PrepareResultOrcidAssociationStep1 { + " SELECT source, target " + " FROM relation " + " WHERE datainfo.deletedbyinference = false " - + getConstraintList(" relclass = '", allowedsemrel) + + getConstraintList(" relclass = '", allowedsemrel) + " ) rel_rel " + " ON source = id"; spark diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java index fd1de3282d..bea847ca76 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java @@ -151,10 +151,10 @@ public class SparkOrcidToResultFromSemRelJob { // in one of the two else { if (autoritative_author - .getName() - .trim() - .substring(0, 0) - .equalsIgnoreCase(author.getName().trim().substring(0, 0))) { + .getName() + .trim() + .substring(0, 0) + .equalsIgnoreCase(author.getName().trim().substring(0, 0))) { toaddpid = true; } } diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java index 920ef1f06d..4cd7f88dff 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java @@ -8,7 +8,6 @@ import static 
eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import java.util.Arrays; import java.util.List; -import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; @@ -22,6 +21,7 @@ import com.google.gson.Gson; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Relation; public class PrepareProjectResultsAssociation { diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java index 44a439ab78..1f6264c186 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java @@ -9,7 +9,6 @@ import java.util.Iterator; import java.util.List; import java.util.Optional; -import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.FlatMapFunction; @@ -21,6 +20,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Relation; import scala.Tuple2; @@ -124,7 +124,7 @@ public class SparkResultToProjectThroughSemRelJob { resId, projectId, ModelConstants.IS_PRODUCED_BY, - ModelConstants.RESULT_PROJECT , + ModelConstants.RESULT_PROJECT, ModelConstants.OUTCOME, PROPAGATION_DATA_INFO_TYPE, 
PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID, @@ -135,7 +135,7 @@ public class SparkResultToProjectThroughSemRelJob { projectId, resId, ModelConstants.PRODUCES, - ModelConstants.RESULT_PROJECT, + ModelConstants.RESULT_PROJECT, ModelConstants.OUTCOME, PROPAGATION_DATA_INFO_TYPE, PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID, diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java index fc9f395d1b..5574aad753 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java @@ -6,7 +6,6 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import java.util.*; -import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -18,6 +17,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.google.gson.Gson; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Relation; public class PrepareResultCommunitySet { diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java index 0a83e4195c..84e40fa88b 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java +++ 
b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java @@ -4,7 +4,6 @@ package eu.dnetlib.dhp.resulttoorganizationfrominstrepo; import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; -import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; @@ -18,6 +17,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Datasource; import eu.dnetlib.dhp.schema.oaf.Organization; import eu.dnetlib.dhp.schema.oaf.Relation; diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java index fc9a46d725..0ce741b873 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java @@ -6,7 +6,6 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import java.util.*; -import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; @@ -20,6 +19,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.common.ModelConstants; import 
eu.dnetlib.dhp.schema.oaf.*; import scala.Tuple2; @@ -151,7 +151,7 @@ public class SparkResultToOrganizationFromIstRepoJob { getRelation( orgId, resultId, - ModelConstants.IS_AUTHOR_INSTITUTION_OF, + ModelConstants.IS_AUTHOR_INSTITUTION_OF, ModelConstants.RESULT_ORGANIZATION, ModelConstants.AFFILIATION, PROPAGATION_DATA_INFO_TYPE, @@ -163,8 +163,8 @@ public class SparkResultToOrganizationFromIstRepoJob { resultId, orgId, ModelConstants.HAS_AUTHOR_INSTITUTION, - ModelConstants.RESULT_ORGANIZATION, - ModelConstants.AFFILIATION, + ModelConstants.RESULT_ORGANIZATION, + ModelConstants.AFFILIATION, PROPAGATION_DATA_INFO_TYPE, PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID, PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME)); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index ebe2b703ba..5b8296c19b 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -453,12 +453,12 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i final Boolean inferred = rs.getBoolean("inferred"); final String trust = rs.getString("trust"); return dataInfo( - deletedbyinference, - inferenceprovenance, - inferred, - false, - ENTITYREGISTRY_PROVENANCE_ACTION, - trust); + deletedbyinference, + inferenceprovenance, + inferred, + false, + ENTITYREGISTRY_PROVENANCE_ACTION, + trust); } private Qualifier prepareQualifierSplitting(final String s) { From 2a4e68a292d98e261527ab267c054cec9ae3aad9 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Fri, 15 May 2020 12:25:37 +0200 Subject: [PATCH 16/24] events recognition --- .../broker/oa/GenerateEventsApplication.java | 22 ++++---- 
.../EnrichMissingAbstract.java | 3 +- .../EnrichMissingAuthorOrcid.java | 3 +- .../oa/matchers/EnrichMissingOpenAccess.java | 55 +++++++++++++++++++ .../{util => matchers}/EnrichMissingPid.java | 19 ++++++- .../EnrichMissingProject.java | 6 +- .../EnrichMissingPublicationDate.java | 3 +- .../oa/matchers/EnrichMissingSubject.java | 53 ++++++++++++++++++ .../oa/matchers/EnrichMoreOpenAccess.java | 53 ++++++++++++++++++ .../oa/{util => matchers}/EnrichMorePid.java | 22 ++++++-- .../{util => matchers}/EnrichMoreSubject.java | 28 +++++++--- .../oa/{util => matchers}/UpdateMatcher.java | 3 +- .../dhp/broker/oa/util/BrokerConstants.java | 7 +++ .../dhp/broker/oa/util/ConversionUtils.java | 36 ++++++++++++ .../oa/util/EnrichMissingOpenAccess.java | 33 ----------- .../broker/oa/util/EnrichMissingSubject.java | 40 -------------- .../broker/oa/util/EnrichMoreOpenAccess.java | 33 ----------- .../dhp/broker/oa/util/UpdateInfo.java | 2 +- 18 files changed, 281 insertions(+), 140 deletions(-) rename dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/{util => matchers}/EnrichMissingAbstract.java (90%) rename dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/{util => matchers}/EnrichMissingAuthorOrcid.java (91%) create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingOpenAccess.java rename dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/{util => matchers}/EnrichMissingPid.java (60%) rename dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/{util => matchers}/EnrichMissingProject.java (86%) rename dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/{util => matchers}/EnrichMissingPublicationDate.java (89%) create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingSubject.java create mode 100644 
dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreOpenAccess.java rename dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/{util => matchers}/EnrichMorePid.java (50%) rename dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/{util => matchers}/EnrichMoreSubject.java (51%) rename dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/{util => matchers}/UpdateMatcher.java (94%) create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java delete mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingOpenAccess.java delete mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingSubject.java delete mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreOpenAccess.java diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java index c4c167c13f..43ebd6dd84 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java @@ -17,18 +17,18 @@ import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.broker.model.Event; import eu.dnetlib.dhp.broker.model.EventFactory; -import eu.dnetlib.dhp.broker.oa.util.EnrichMissingAbstract; -import eu.dnetlib.dhp.broker.oa.util.EnrichMissingAuthorOrcid; -import eu.dnetlib.dhp.broker.oa.util.EnrichMissingOpenAccess; -import eu.dnetlib.dhp.broker.oa.util.EnrichMissingPid; 
-import eu.dnetlib.dhp.broker.oa.util.EnrichMissingProject; -import eu.dnetlib.dhp.broker.oa.util.EnrichMissingPublicationDate; -import eu.dnetlib.dhp.broker.oa.util.EnrichMissingSubject; -import eu.dnetlib.dhp.broker.oa.util.EnrichMoreOpenAccess; -import eu.dnetlib.dhp.broker.oa.util.EnrichMorePid; -import eu.dnetlib.dhp.broker.oa.util.EnrichMoreSubject; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingAbstract; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingAuthorOrcid; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingOpenAccess; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPid; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingProject; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPublicationDate; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingSubject; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMoreOpenAccess; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMorePid; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMoreSubject; +import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; -import eu.dnetlib.dhp.broker.oa.util.UpdateMatcher; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.oaf.Result; diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAbstract.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAbstract.java similarity index 90% rename from dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAbstract.java rename to dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAbstract.java index 6b6e35d1d6..43cf738f8c 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAbstract.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAbstract.java @@ -1,11 +1,12 @@ 
-package eu.dnetlib.dhp.broker.oa.util; +package eu.dnetlib.dhp.broker.oa.matchers; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; public class EnrichMissingAbstract extends UpdateMatcher { diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAuthorOrcid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAuthorOrcid.java similarity index 91% rename from dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAuthorOrcid.java rename to dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAuthorOrcid.java index d81427e05c..beeccdbe84 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAuthorOrcid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAuthorOrcid.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.broker.oa.util; +package eu.dnetlib.dhp.broker.oa.matchers; import java.util.Arrays; import java.util.List; @@ -7,6 +7,7 @@ import java.util.List; import org.apache.commons.lang3.tuple.Pair; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; public class EnrichMissingAuthorOrcid extends UpdateMatcher> { diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingOpenAccess.java new file mode 100644 index 0000000000..a4a2ea0c6c --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingOpenAccess.java @@ -0,0 +1,55 @@ + +package 
eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +import eu.dnetlib.broker.objects.Instance; +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMissingOpenAccess extends UpdateMatcher { + + public EnrichMissingOpenAccess() { + super(true); + } + + @Override + protected List> findUpdates(final Result source, final Result target) { + final long count = target + .getInstance() + .stream() + .map(i -> i.getAccessright().getClassid()) + .filter(right -> right.equals(BrokerConstants.OPEN_ACCESS)) + .count(); + + if (count > 0) { + return Arrays.asList(); + } + + return source + .getInstance() + .stream() + .filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS)) + .map(ConversionUtils::oafInstanceToBrokerInstances) + .flatMap(s -> s) + .map(i -> generateUpdateInfo(i, source, target)) + .collect(Collectors.toList()); + } + + @Override + public UpdateInfo generateUpdateInfo(final Instance highlightValue, + final Result source, + final Result target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_OA_VERSION, + highlightValue, source, target, + (p, i) -> p.getInstances().add(i), + Instance::getUrl); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPid.java similarity index 60% rename from dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPid.java rename to dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPid.java index 0b4045a0e4..a8df62541e 100644 --- 
a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPid.java @@ -1,11 +1,14 @@ -package eu.dnetlib.dhp.broker.oa.util; +package eu.dnetlib.dhp.broker.oa.matchers; import java.util.Arrays; import java.util.List; +import java.util.stream.Collectors; import eu.dnetlib.broker.objects.Pid; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; public class EnrichMissingPid extends UpdateMatcher { @@ -16,8 +19,18 @@ public class EnrichMissingPid extends UpdateMatcher { @Override protected List> findUpdates(final Result source, final Result target) { - // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); - return Arrays.asList(); + final long count = target.getPid().size(); + + if (count > 0) { + return Arrays.asList(); + } + + return source + .getPid() + .stream() + .map(ConversionUtils::oafPidToBrokerPid) + .map(i -> generateUpdateInfo(i, source, target)) + .collect(Collectors.toList()); } @Override diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingProject.java similarity index 86% rename from dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingProject.java rename to dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingProject.java index 45b16801c2..b6e5b3b574 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingProject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingProject.java @@ -1,11 +1,12 @@ -package eu.dnetlib.dhp.broker.oa.util; 
+package eu.dnetlib.dhp.broker.oa.matchers; import java.util.Arrays; import java.util.List; import eu.dnetlib.broker.objects.Project; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; public class EnrichMissingProject extends UpdateMatcher { @@ -21,7 +22,8 @@ public class EnrichMissingProject extends UpdateMatcher { } @Override - public UpdateInfo generateUpdateInfo(final Project highlightValue, final Result source, + public UpdateInfo generateUpdateInfo(final Project highlightValue, + final Result source, final Result target) { return new UpdateInfo<>( Topic.ENRICH_MISSING_PROJECT, diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPublicationDate.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationDate.java similarity index 89% rename from dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPublicationDate.java rename to dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationDate.java index 7fcd2a66f9..e9ec082c46 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPublicationDate.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationDate.java @@ -1,10 +1,11 @@ -package eu.dnetlib.dhp.broker.oa.util; +package eu.dnetlib.dhp.broker.oa.matchers; import java.util.Arrays; import java.util.List; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; public class EnrichMissingPublicationDate extends UpdateMatcher { diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingSubject.java 
b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingSubject.java new file mode 100644 index 0000000000..79e9d469be --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingSubject.java @@ -0,0 +1,53 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; + +public class EnrichMissingSubject extends UpdateMatcher> { + + public EnrichMissingSubject() { + super(true); + } + + @Override + protected List>> findUpdates(final Result source, final Result target) { + final Set existingTypes = target + .getSubject() + .stream() + .map(StructuredProperty::getQualifier) + .map(Qualifier::getClassid) + .collect(Collectors.toSet()); + + return source + .getSubject() + .stream() + .filter(pid -> !existingTypes.contains(pid.getQualifier().getClassid())) + .map(ConversionUtils::oafSubjectToPair) + .map(i -> generateUpdateInfo(i, source, target)) + .collect(Collectors.toList()); + } + + @Override + public UpdateInfo> generateUpdateInfo(final Pair highlightValue, + final Result source, + final Result target) { + + return new UpdateInfo<>( + Topic.fromPath("ENRICH/MISSING/SUBJECT/" + highlightValue.getLeft()), + highlightValue, source, target, + (p, pair) -> p.getSubjects().add(pair.getRight()), + pair -> pair.getLeft() + "::" + pair.getRight()); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreOpenAccess.java new
file mode 100644 index 0000000000..40c9b0500a --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreOpenAccess.java @@ -0,0 +1,53 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +import eu.dnetlib.broker.objects.Instance; +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMoreOpenAccess extends UpdateMatcher { + + public EnrichMoreOpenAccess() { + super(true); + } + + @Override + protected List> findUpdates(final Result source, final Result target) { + final Set urls = target + .getInstance() + .stream() + .filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS)) + .map(i -> i.getUrl()) + .flatMap(List::stream) + .collect(Collectors.toSet()); + + return source + .getInstance() + .stream() + .filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS)) + .map(ConversionUtils::oafInstanceToBrokerInstances) + .flatMap(s -> s) + .filter(i -> !urls.contains(i.getUrl())) + .map(i -> generateUpdateInfo(i, source, target)) + .collect(Collectors.toList()); + } + + @Override + public UpdateInfo generateUpdateInfo(final Instance highlightValue, + final Result source, + final Result target) { + return new UpdateInfo<>( + Topic.ENRICH_MORE_OA_VERSION, + highlightValue, source, target, + (p, i) -> p.getInstances().add(i), + Instance::getUrl); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMorePid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMorePid.java similarity index 50% rename from dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMorePid.java 
rename to dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMorePid.java index 8cd67f5536..0e7b7766ab 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMorePid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMorePid.java @@ -1,11 +1,14 @@ -package eu.dnetlib.dhp.broker.oa.util; +package eu.dnetlib.dhp.broker.oa.matchers; -import java.util.Arrays; import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; import eu.dnetlib.broker.objects.Pid; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; public class EnrichMorePid extends UpdateMatcher { @@ -16,8 +19,19 @@ public class EnrichMorePid extends UpdateMatcher { @Override protected List> findUpdates(final Result source, final Result target) { - // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); - return Arrays.asList(); + final Set existingPids = target + .getPid() + .stream() + .map(pid -> pid.getQualifier().getClassid() + "::" + pid.getValue()) + .collect(Collectors.toSet()); + + return source + .getPid() + .stream() + .filter(pid -> !existingPids.contains(pid.getQualifier().getClassid() + "::" + pid.getValue())) + .map(ConversionUtils::oafPidToBrokerPid) + .map(i -> generateUpdateInfo(i, source, target)) + .collect(Collectors.toList()); } @Override diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreSubject.java similarity index 51% rename from dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreSubject.java rename to dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreSubject.java index 
9e0d8e6939..e6374479bf 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreSubject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreSubject.java @@ -1,12 +1,15 @@ -package eu.dnetlib.dhp.broker.oa.util; +package eu.dnetlib.dhp.broker.oa.matchers; -import java.util.Arrays; import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; import org.apache.commons.lang3.tuple.Pair; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; public class EnrichMoreSubject extends UpdateMatcher> { @@ -17,18 +20,25 @@ public class EnrichMoreSubject extends UpdateMatcher> { @Override protected List>> findUpdates(final Result source, final Result target) { - // MESHEUROPMC - // ARXIV - // JEL - // DDC - // ACM + final Set existingSubjects = target + .getSubject() + .stream() + .map(pid -> pid.getQualifier().getClassid() + "::" + pid.getValue()) + .collect(Collectors.toSet()); - return Arrays.asList(); + return source + .getSubject() + .stream() + .filter(pid -> !existingSubjects.contains(pid.getQualifier().getClassid() + "::" + pid.getValue())) + .map(ConversionUtils::oafSubjectToPair) + .map(i -> generateUpdateInfo(i, source, target)) + .collect(Collectors.toList()); } @Override public UpdateInfo> generateUpdateInfo(final Pair highlightValue, - final Result source, final Result target) { + final Result source, + final Result target) { return new UpdateInfo<>( Topic.fromPath("ENRICH/MORE/SUBJECT/" + highlightValue.getLeft()), diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateMatcher.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java similarity index 94% rename from
dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateMatcher.java rename to dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java index 3fd6d40276..b8b6132cd1 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateMatcher.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.broker.oa.util; +package eu.dnetlib.dhp.broker.oa.matchers; import java.util.Arrays; import java.util.Collection; @@ -10,6 +10,7 @@ import java.util.Map; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Field; import eu.dnetlib.dhp.schema.oaf.Result; diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java new file mode 100644 index 0000000000..d61d5bfb7d --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java @@ -0,0 +1,7 @@ + +package eu.dnetlib.dhp.broker.oa.util; + +public class BrokerConstants { + + public final static String OPEN_ACCESS = "OPEN"; +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java new file mode 100644 index 0000000000..2e2ce202ad --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java @@ -0,0 +1,36 @@ + +package eu.dnetlib.dhp.broker.oa.util; + +import java.util.stream.Stream; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.broker.objects.Instance; +import eu.dnetlib.broker.objects.Pid; 
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty; + +public class ConversionUtils { + + public static Stream oafInstanceToBrokerInstances(final eu.dnetlib.dhp.schema.oaf.Instance i) { + return i.getUrl().stream().map(url -> { + final Instance r = new Instance(); + r.setUrl(url); + r.setInstancetype(i.getInstancetype().getClassid()); + r.setLicense(BrokerConstants.OPEN_ACCESS); + r.setHostedby(i.getHostedby().getValue()); + return r; + }); + } + + public static Pid oafPidToBrokerPid(final StructuredProperty sp) { + final Pid pid = new Pid(); + pid.setValue(sp.getValue()); + pid.setType(sp.getQualifier().getClassid()); + return pid; + } + + public static final Pair oafSubjectToPair(final StructuredProperty sp) { + return Pair.of(sp.getQualifier().getClassid(), sp.getValue()); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingOpenAccess.java deleted file mode 100644 index 9079ee24b0..0000000000 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingOpenAccess.java +++ /dev/null @@ -1,33 +0,0 @@ - -package eu.dnetlib.dhp.broker.oa.util; - -import java.util.Arrays; -import java.util.List; - -import eu.dnetlib.broker.objects.Instance; -import eu.dnetlib.dhp.broker.model.Topic; -import eu.dnetlib.dhp.schema.oaf.Result; - -public class EnrichMissingOpenAccess extends UpdateMatcher { - - public EnrichMissingOpenAccess() { - super(true); - } - - @Override - protected List> findUpdates(final Result source, final Result target) { - - return Arrays.asList(); - } - - @Override - public UpdateInfo generateUpdateInfo(final Instance highlightValue, final Result source, - final Result target) { - return new UpdateInfo<>( - Topic.ENRICH_MISSING_OA_VERSION, - highlightValue, source, target, - (p, i) -> p.getInstances().add(i), - Instance::getUrl); - } - -} diff --git 
a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingSubject.java deleted file mode 100644 index 4470bd9d98..0000000000 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingSubject.java +++ /dev/null @@ -1,40 +0,0 @@ - -package eu.dnetlib.dhp.broker.oa.util; - -import java.util.Arrays; -import java.util.List; - -import org.apache.commons.lang3.tuple.Pair; - -import eu.dnetlib.dhp.broker.model.Topic; -import eu.dnetlib.dhp.schema.oaf.Result; - -public class EnrichMissingSubject extends UpdateMatcher> { - - public EnrichMissingSubject() { - super(true); - } - - @Override - protected List>> findUpdates(final Result source, final Result target) { - // MESHEUROPMC - // ARXIV - // JEL - // DDC - // ACM - - return Arrays.asList(); - } - - @Override - public UpdateInfo> generateUpdateInfo(final Pair highlightValue, - final Result source, final Result target) { - - return new UpdateInfo<>( - Topic.fromPath("ENRICH/MISSING/SUBJECT/" + highlightValue.getLeft()), - highlightValue, source, target, - (p, pair) -> p.getSubjects().add(pair.getRight()), - pair -> pair.getLeft() + "::" + pair.getRight()); - } - -} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreOpenAccess.java deleted file mode 100644 index bc37ce659a..0000000000 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreOpenAccess.java +++ /dev/null @@ -1,33 +0,0 @@ - -package eu.dnetlib.dhp.broker.oa.util; - -import java.util.Arrays; -import java.util.List; - -import eu.dnetlib.broker.objects.Instance; -import eu.dnetlib.dhp.broker.model.Topic; -import eu.dnetlib.dhp.schema.oaf.Result; - -public class EnrichMoreOpenAccess extends UpdateMatcher { - - 
public EnrichMoreOpenAccess() { - super(true); - } - - @Override - protected List> findUpdates(final Result source, final Result target) { - // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); - return Arrays.asList(); - } - - @Override - public UpdateInfo generateUpdateInfo(final Instance highlightValue, final Result source, - final Result target) { - return new UpdateInfo<>( - Topic.ENRICH_MORE_OA_VERSION, - highlightValue, source, target, - (p, i) -> p.getInstances().add(i), - Instance::getUrl); - } - -} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java index 1dfc14e5eb..5cc0d371db 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java @@ -25,7 +25,7 @@ public final class UpdateInfo { private final float trust; - protected UpdateInfo(final Topic topic, final T highlightValue, final Result source, final Result target, + public UpdateInfo(final Topic topic, final T highlightValue, final Result source, final Result target, final BiConsumer compileHighlight, final Function highlightToString) { this.topic = topic; From cfc8948717b4f5da506af7fc3c0a3d230ecb4c69 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 15 May 2020 12:26:16 +0200 Subject: [PATCH 17/24] fixed mapping OdfToGraph: pick the correct element to map author pids and author affiliations; extended mapping Oaf2Graph: added support for author pids --- .../dhp/oa/graph/raw/OafToOafMapper.java | 20 +++++-- .../dhp/oa/graph/raw/OdfToOafMapper.java | 8 +-- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 55 +++++++++++++++++-- .../dnetlib/dhp/oa/graph/raw/oaf_record.xml | 2 +- .../dnetlib/dhp/oa/graph/raw/odf_dataset.xml | 3 +- 5 files changed, 71 insertions(+), 17 deletions(-) diff --git 
a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index 891fee57e8..6b6aa15e88 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -1,15 +1,16 @@ package eu.dnetlib.dhp.oa.graph.raw; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field; +import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.*; import static eu.dnetlib.dhp.schema.common.ModelConstants.*; import java.util.*; import java.util.stream.Collectors; +import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.lang3.StringUtils; import org.dom4j.Document; +import org.dom4j.Element; import org.dom4j.Node; import com.google.common.collect.Lists; @@ -28,15 +29,24 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { final List res = new ArrayList<>(); int pos = 1; for (final Object o : doc.selectNodes("//dc:creator")) { - final Node n = (Node) o; + final Element e = (Element) o; final Author author = new Author(); - author.setFullname(n.getText()); + author.setFullname(e.getText()); author.setRank(pos++); - final PacePerson p = new PacePerson(n.getText(), false); + final PacePerson p = new PacePerson(e.getText(), false); if (p.isAccurate()) { author.setName(p.getNormalisedFirstName()); author.setSurname(p.getNormalisedSurname()); } + + final String pid = e.attributeValue("nameIdentifier"); + final String pidType = e.attributeValue("nameIdentifierScheme"); + + if (StringUtils.isNotBlank(pid) && StringUtils.isNotBlank(pidType)) { + author.setPid(new ArrayList<>()); + author.getPid().add(structuredProperty(pid, qualifier(pidType, pidType, DNET_PID_TYPES, DNET_PID_TYPES), info)); + 
} + res.add(author); } return res; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 5baac12fd0..30b980c422 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -63,17 +63,17 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { author.setSurname(surname); } - author.setAffiliation(prepareListFields(doc, "./datacite:affiliation", info)); - author.setPid(preparePids(doc, info)); + author.setAffiliation(prepareListFields(n, "./datacite:affiliation", info)); + author.setPid(preparePids(n, info)); author.setRank(pos++); res.add(author); } return res; } - private List preparePids(final Document doc, final DataInfo info) { + private List preparePids(final Node n, final DataInfo info) { final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes("./datacite:nameIdentifier")) { + for (final Object o : n.selectNodes("./datacite:nameIdentifier")) { res .add( structuredProperty( diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 951c97d9de..d7635c9ea3 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -10,7 +10,10 @@ import static org.mockito.Mockito.when; import java.io.IOException; import java.util.List; import java.util.Map; +import java.util.Optional; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.*; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import 
org.junit.jupiter.api.BeforeEach; @@ -19,12 +22,6 @@ import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; -import eu.dnetlib.dhp.schema.oaf.Dataset; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.Publication; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.schema.oaf.Software; - @ExtendWith(MockitoExtension.class) public class MappersTest { @@ -54,7 +51,26 @@ public class MappersTest { assertValidId(p.getId()); assertValidId(p.getCollectedfrom().get(0).getKey()); assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); + assertTrue(p.getAuthor().size() > 0); + Optional author = p.getAuthor() + .stream() + .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) + .findFirst(); + assertTrue(author.isPresent()); + StructuredProperty pid = author.get().getPid() + .stream() + .findFirst() + .get(); + assertEquals("0000-0001-6651-1178", pid.getValue()); + assertEquals("ORCID", pid.getQualifier().getClassid()); + assertEquals("ORCID", pid.getQualifier().getClassname()); + assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemeid()); + assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemename()); + assertEquals("Votsi,Nefta", author.get().getFullname()); + assertEquals("Votsi", author.get().getSurname()); + assertEquals("Nefta", author.get().getName()); + assertTrue(p.getSubject().size() > 0); assertTrue(StringUtils.isNotBlank(p.getJournal().getIssnOnline())); assertTrue(StringUtils.isNotBlank(p.getJournal().getName())); @@ -100,6 +116,33 @@ public class MappersTest { assertValidId(d.getCollectedfrom().get(0).getKey()); assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue())); assertTrue(d.getAuthor().size() > 0); + + Optional author = d.getAuthor() + .stream() + .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) + .findFirst(); + assertTrue(author.isPresent()); + StructuredProperty pid = 
author.get().getPid() + .stream() + .findFirst() + .get(); + assertEquals("0000-0001-9074-1619", pid.getValue()); + assertEquals("ORCID", pid.getQualifier().getClassid()); + assertEquals("ORCID", pid.getQualifier().getClassname()); + assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemeid()); + assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemename()); + assertEquals("Baracchini, Theo", author.get().getFullname()); + assertEquals("Baracchini", author.get().getSurname()); + assertEquals("Theo", author.get().getName()); + + assertEquals(1, author.get().getAffiliation().size()); + Optional> opAff = author.get().getAffiliation() + .stream() + .findFirst(); + assertTrue(opAff.isPresent()); + Field affiliation = opAff.get(); + assertEquals("ISTI-CNR", affiliation.getValue()); + assertTrue(d.getSubject().size() > 0); assertTrue(d.getInstance().size() > 0); assertTrue(d.getContext().size() > 0); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml index e898d4434f..2cb0ba1c78 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml @@ -19,7 +19,7 @@ Ecosystem Service capacity is higher in areas of multiple designation types Nikolaidou,Charitini - Votsi,Nefta + Votsi,Nefta Sgardelis,Steanos Halley,John Pantis,John diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml index 94dc802fac..88ae9d1063 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml +++ 
b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml @@ -35,9 +35,10 @@ Baracchini, Theo + 0000-0001-9074-1619 Theo Baracchini - Physics of Aquatic Systems Laboratory (APHYS) – Margaretha Kamprad Chair, ENAC, EPFL, Lausanne, 1015, Switzerland + ISTI-CNR Wüest, Alfred From 5ec8c49ad5144d056512ac25ea3a8444e71ca4f1 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 15 May 2020 12:49:58 +0200 Subject: [PATCH 18/24] removed serialization points --- .../dhp/blacklist/SparkRemoveBlacklistedRelationJob.java | 7 ------- 1 file changed, 7 deletions(-) diff --git a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java index 92289ec2d3..91bcb9d1c7 100644 --- a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java +++ b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java @@ -79,8 +79,6 @@ public class SparkRemoveBlacklistedRelationJob { Dataset inputRelation = readRelations(spark, inputPath); Dataset mergesRelation = readRelations(spark, mergesPath); - log.info("InputRelationCount: {}", inputRelation.count()); - Dataset dedupSource = blackListed .joinWith( mergesRelation, blackListed.col("source").equalTo(mergesRelation.col("target")), @@ -103,11 +101,6 @@ public class SparkRemoveBlacklistedRelationJob { return c._1(); }, Encoders.bean(Relation.class)); - dedupBL - .write() - .mode(SaveMode.Overwrite) - .json(blacklistPath + "/deduped"); - inputRelation .joinWith( dedupBL, (inputRelation From 7a89507ab1b1f347cdcfb46ffa8a908072aed057 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 15 May 2020 15:16:54 +0200 Subject: [PATCH 19/24] code formatting --- .../dhp/oa/graph/raw/OafToOafMapper.java | 6 ++- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 51 
+++++++++++-------- 2 files changed, 34 insertions(+), 23 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index 6b6aa15e88..54594cb803 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -7,7 +7,6 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.*; import java.util.*; import java.util.stream.Collectors; -import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.lang3.StringUtils; import org.dom4j.Document; import org.dom4j.Element; @@ -16,6 +15,7 @@ import org.dom4j.Node; import com.google.common.collect.Lists; import eu.dnetlib.dhp.oa.graph.raw.common.PacePerson; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; public class OafToOafMapper extends AbstractMdRecordToOafMapper { @@ -44,7 +44,9 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { if (StringUtils.isNotBlank(pid) && StringUtils.isNotBlank(pidType)) { author.setPid(new ArrayList<>()); - author.getPid().add(structuredProperty(pid, qualifier(pidType, pidType, DNET_PID_TYPES, DNET_PID_TYPES), info)); + author + .getPid() + .add(structuredProperty(pid, qualifier(pidType, pidType, DNET_PID_TYPES, DNET_PID_TYPES), info)); } res.add(author); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index d7635c9ea3..5a006e3513 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -12,8 +12,6 @@ import java.util.List; import 
java.util.Map; import java.util.Optional; -import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.*; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.junit.jupiter.api.BeforeEach; @@ -22,6 +20,9 @@ import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.*; + @ExtendWith(MockitoExtension.class) public class MappersTest { @@ -53,15 +54,18 @@ public class MappersTest { assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); assertTrue(p.getAuthor().size() > 0); - Optional author = p.getAuthor() - .stream() - .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) - .findFirst(); + Optional author = p + .getAuthor() + .stream() + .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) + .findFirst(); assertTrue(author.isPresent()); - StructuredProperty pid = author.get().getPid() - .stream() - .findFirst() - .get(); + StructuredProperty pid = author + .get() + .getPid() + .stream() + .findFirst() + .get(); assertEquals("0000-0001-6651-1178", pid.getValue()); assertEquals("ORCID", pid.getQualifier().getClassid()); assertEquals("ORCID", pid.getQualifier().getClassname()); @@ -117,15 +121,18 @@ public class MappersTest { assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue())); assertTrue(d.getAuthor().size() > 0); - Optional author = d.getAuthor() - .stream() - .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) - .findFirst(); + Optional author = d + .getAuthor() + .stream() + .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) + .findFirst(); assertTrue(author.isPresent()); - StructuredProperty pid = author.get().getPid() - .stream() - .findFirst() - .get(); + StructuredProperty pid = author + .get() + .getPid() + .stream() + .findFirst() + .get(); assertEquals("0000-0001-9074-1619", pid.getValue()); 
assertEquals("ORCID", pid.getQualifier().getClassid()); assertEquals("ORCID", pid.getQualifier().getClassname()); @@ -136,9 +143,11 @@ public class MappersTest { assertEquals("Theo", author.get().getName()); assertEquals(1, author.get().getAffiliation().size()); - Optional> opAff = author.get().getAffiliation() - .stream() - .findFirst(); + Optional> opAff = author + .get() + .getAffiliation() + .stream() + .findFirst(); assertTrue(opAff.isPresent()); Field affiliation = opAff.get(); assertEquals("ISTI-CNR", affiliation.getValue()); From 82b615ab33be893b8527c0629b6a1e228fd80393 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 15 May 2020 16:04:46 +0200 Subject: [PATCH 20/24] NPE check --- .../src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java index d3ae8ee4f9..a6ec364c4b 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java @@ -129,6 +129,9 @@ public class DedupUtility { .max(Comparator.comparing(Tuple2::_1)); if (simAuhtor.isPresent() && simAuhtor.get()._1() > THRESHOLD) { Author r = simAuhtor.get()._2(); + if (r.getPid() == null) { + r.setPid(new ArrayList<>()); + } r.getPid().add(a._1()); } }); From 7838f2c63fb99e956aaf4bed024363b0c3f54178 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 15 May 2020 17:06:01 +0200 Subject: [PATCH 21/24] init the empty list for author pids mapped from OAF --- .../main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java 
b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index 54594cb803..ed09016da8 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -42,8 +42,8 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { final String pid = e.attributeValue("nameIdentifier"); final String pidType = e.attributeValue("nameIdentifierScheme"); + author.setPid(new ArrayList<>()); if (StringUtils.isNotBlank(pid) && StringUtils.isNotBlank(pidType)) { - author.setPid(new ArrayList<>()); author .getPid() .add(structuredProperty(pid, qualifier(pidType, pidType, DNET_PID_TYPES, DNET_PID_TYPES), info)); From ef9a9a9f1af74f49fe0d4dda802d33790a84c4a6 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 15 May 2020 22:34:19 +0200 Subject: [PATCH 22/24] remove the output path when starting --- .../PrepareResultCommunitySetStep2.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java index 723aa89606..09340369d3 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java @@ -53,9 +53,7 @@ public class PrepareResultCommunitySetStep2 { conf, isSparkSessionManaged, spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } + removeOutputDir(spark, outputPath); mergeInfo(spark, inputPath, outputPath); }); } From b71fbb68b16cb1515614f16237d8fa8196871d29 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Mon, 18 May
2020 13:57:20 +0200 Subject: [PATCH 23/24] removed the removeOutputDir command from code. Reltions are written in Append. The erase of the output dir ment to remove all the relations computed in the prevoius steps --- .../SparkResultToOrganizationFromIstRepoJob.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java index 0ce741b873..0c5e1d8be6 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java @@ -84,7 +84,7 @@ public class SparkResultToOrganizationFromIstRepoJob { conf, isSparkSessionManaged, spark -> { - removeOutputDir(spark, outputPath); + // removeOutputDir(spark, outputPath); if (saveGraph) { execPropagation( spark, From 486e850bcc24c88ec3b36747ccae3955c01455b5 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Tue, 19 May 2020 09:24:45 +0200 Subject: [PATCH 24/24] next step of MAG conversion implemented --- .../doiboost/DoiBoostMappingUtil.scala | 104 +++++++++ .../doiboost/crossref/Crossref2Oaf.scala | 83 +------- .../dnetlib/doiboost/mag/MagDataModel.scala | 199 ++++++++++++++++-- .../doiboost/mag/SparkPreProcessMAG.scala | 91 +++++++- .../dhp/doiboost/mag/oozie_app/workflow.xml | 1 + .../oozie_app/config-default.xml | 4 + .../orcid_gen_authors/oozie_app/workflow.xml | 5 + .../dnetlib/doiboost/mag/MAGMappingTest.scala | 56 ++++- 8 files changed, 436 insertions(+), 107 deletions(-) create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala diff --git 
a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala new file mode 100644 index 0000000000..68a3231e06 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala @@ -0,0 +1,104 @@ +package eu.dnetlib.doiboost + +import eu.dnetlib.dhp.schema.oaf.{DataInfo, Dataset, Field, KeyValue, Qualifier, Result, StructuredProperty} +import eu.dnetlib.dhp.utils.DHPUtils + +object DoiBoostMappingUtil { + + //STATIC STRING + val MAG = "microsoft" + val ORCID = "ORCID" + val CROSSREF = "Crossref" + val UNPAYWALL = "UnpayWall" + val GRID_AC = "grid.ac" + val WIKPEDIA = "wikpedia" + val doiBoostNSPREFIX = "doiboost____" + val OPENAIRE_PREFIX = "openaire____" + val SEPARATOR = "::" + val DNET_LANGUAGES = "dnet:languages" + val PID_TYPES = "dnet:pid_types" + + + + def generateDataInfo(): DataInfo = { + val di = new DataInfo + di.setDeletedbyinference(false) + di.setInferred(false) + di.setInvisible(false) + di.setTrust("0.9") + di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions")) + di + } + + + def createSP(value: String, classId: String, schemeId: String): StructuredProperty = { + val sp = new StructuredProperty + sp.setQualifier(createQualifier(classId, schemeId)) + sp.setValue(value) + sp + + } + + def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = { + val sp = new StructuredProperty + sp.setQualifier(createQualifier(classId, schemeId)) + sp.setValue(value) + sp.setDataInfo(dataInfo) + sp + + } + + def createCrossrefCollectedFrom(): KeyValue = { + + val cf = new KeyValue + cf.setValue(CROSSREF) + cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5("crossref")) + cf + + } + + def generateIdentifier(oaf: Result, doi: String): String = { + val id = DHPUtils.md5(doi.toLowerCase) + if (oaf.isInstanceOf[Dataset]) + return 
s"60|${doiBoostNSPREFIX}${SEPARATOR}${id}" + s"50|${doiBoostNSPREFIX}${SEPARATOR}${id}" + } + + + + + + def createMAGCollectedFrom(): KeyValue = { + + val cf = new KeyValue + cf.setValue(MAG) + cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(MAG)) + cf + + } + + def createQualifier(clsName: String, clsValue: String, schName: String, schValue: String): Qualifier = { + val q = new Qualifier + q.setClassid(clsName) + q.setClassname(clsValue) + q.setSchemeid(schName) + q.setSchemename(schValue) + q + } + + def createQualifier(cls: String, sch: String): Qualifier = { + createQualifier(cls, cls, sch, sch) + } + + + def asField[T](value: T): Field[T] = { + val tmp = new Field[T] + tmp.setValue(value) + tmp + + + } + + + +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index 2d3b9a43a2..eda3bf17a5 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -14,6 +14,7 @@ import org.slf4j.{Logger, LoggerFactory} import scala.collection.JavaConverters._ import scala.collection.mutable import scala.util.matching.Regex +import eu.dnetlib.doiboost.DoiBoostMappingUtil._ case class mappingAffiliation(name: String) {} @@ -25,18 +26,7 @@ case class mappingFunder(name: String, DOI: Option[String], award: Option[List[S case object Crossref2Oaf { val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass) - //STATIC STRING - val MAG = "MAG" - val ORCID = "ORCID" - val CROSSREF = "Crossref" - val UNPAYWALL = "UnpayWall" - val GRID_AC = "grid.ac" - val WIKPEDIA = "wikpedia" - val doiBoostNSPREFIX = "doiboost____" - val OPENAIRE_PREFIX = "openaire____" - val SEPARATOR = "::" - val DNET_LANGUAGES = "dnet:languages" - val PID_TYPES = "dnet:pid_types" + val mappingCrossrefType = Map( 
"book-section" -> "publication", @@ -116,7 +106,7 @@ case object Crossref2Oaf { result.setLastupdatetimestamp((json \ "indexed" \ "timestamp").extract[Long]) result.setDateofcollection((json \ "indexed" \ "date-time").extract[String]) - result.setCollectedfrom(List(createCollectedFrom()).asJava) + result.setCollectedfrom(List(createCrossrefCollectedFrom()).asJava) // Publisher ( Name of work's publisher mapped into Result/Publisher) val publisher = (json \ "publisher").extractOrElse[String](null) @@ -168,7 +158,7 @@ case object Crossref2Oaf { result.setInstance(List(instance).asJava) instance.setInstancetype(createQualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), "dnet:publication_resource", "dnet:publication_resource")) - instance.setCollectedfrom(createCollectedFrom()) + instance.setCollectedfrom(createCrossrefCollectedFrom()) if (StringUtils.isNotBlank(issuedDate)) { instance.setDateofacceptance(asField(issuedDate)) } @@ -215,7 +205,7 @@ case object Crossref2Oaf { val funderList: List[mappingFunder] = (json \ "funder").extractOrElse[List[mappingFunder]](List()) if (funderList.nonEmpty) { - resultList = resultList ::: mappingFunderToRelations(funderList, result.getId, createCollectedFrom(), result.getDataInfo, result.getLastupdatetimestamp) + resultList = resultList ::: mappingFunderToRelations(funderList, result.getId, createCrossrefCollectedFrom(), result.getDataInfo, result.getLastupdatetimestamp) } @@ -416,71 +406,8 @@ case object Crossref2Oaf { } - def generateIdentifier(oaf: Result, doi: String): String = { - val id = DHPUtils.md5(doi.toLowerCase) - if (oaf.isInstanceOf[Dataset]) - return s"60|${doiBoostNSPREFIX}${SEPARATOR}${id}" - s"50|${doiBoostNSPREFIX}${SEPARATOR}${id}" - } - - def asField[T](value: T): Field[T] = { - val tmp = new Field[T] - tmp.setValue(value) - tmp - } - - - def generateDataInfo(): DataInfo = { - val di = new DataInfo - di.setDeletedbyinference(false) - di.setInferred(false) - di.setInvisible(false) - 
di.setTrust("0.9") - di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions")) - di - } - - - def createSP(value: String, classId: String, schemeId: String): StructuredProperty = { - val sp = new StructuredProperty - sp.setQualifier(createQualifier(classId, schemeId)) - sp.setValue(value) - sp - - } - - def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = { - val sp = new StructuredProperty - sp.setQualifier(createQualifier(classId, schemeId)) - sp.setValue(value) - sp.setDataInfo(dataInfo) - sp - - } - - def createCollectedFrom(): KeyValue = { - - val cf = new KeyValue - cf.setValue(CROSSREF) - cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5("crossref")) - cf - - } - - def createQualifier(clsName: String, clsValue: String, schName: String, schValue: String): Qualifier = { - val q = new Qualifier - q.setClassid(clsName) - q.setClassname(clsValue) - q.setSchemeid(schName) - q.setSchemename(schValue) - q - } - - def createQualifier(cls: String, sch: String): Qualifier = { - createQualifier(cls, cls, sch, sch) - } def generateItemFromType(objectType: String, objectSubType: String): Result = { diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala index 189e90ed9a..17f0395ca0 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala @@ -1,52 +1,215 @@ package eu.dnetlib.doiboost.mag +import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication} import org.json4s import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods.parse +import eu.dnetlib.doiboost.DoiBoostMappingUtil._ + +import scala.collection.JavaConverters._ +import scala.collection.mutable +import scala.util.matching.Regex -case class 
Papers(PaperId:Long, Rank:Integer, Doi:String, - DocType:String, PaperTitle:String, OriginalTitle:String, - BookTitle:String, Year:Option[Integer], Date:Option[java.sql.Timestamp], Publisher:String, - JournalId:Option[Long], ConferenceSeriesId:Option[Long], ConferenceInstanceId:Option[Long], - Volume:String, Issue:String, FirstPage:String, LastPage:String, - ReferenceCount:Option[Long], CitationCount:Option[Long], EstimatedCitation:Option[Long], - OriginalVenue:String, FamilyId:Option[Long], CreatedDate:java.sql.Timestamp) {} +case class MagPapers(PaperId: Long, Rank: Integer, Doi: String, + DocType: String, PaperTitle: String, OriginalTitle: String, + BookTitle: String, Year: Option[Integer], Date: Option[java.sql.Timestamp], Publisher: String, + JournalId: Option[Long], ConferenceSeriesId: Option[Long], ConferenceInstanceId: Option[Long], + Volume: String, Issue: String, FirstPage: String, LastPage: String, + ReferenceCount: Option[Long], CitationCount: Option[Long], EstimatedCitation: Option[Long], + OriginalVenue: String, FamilyId: Option[Long], CreatedDate: java.sql.Timestamp) {} -case class PaperAbstract(PaperId:Long,IndexedAbstract:String) {} +case class MagPaperAbstract(PaperId: Long, IndexedAbstract: String) {} +case class MagAuthor(AuthorId: Long, Rank: Option[Int], NormalizedName: Option[String], DisplayName: Option[String], LastKnownAffiliationId: Option[Long], PaperCount: Option[Long], CitationCount: Option[Long], CreatedDate: Option[java.sql.Timestamp]) {} + +case class MagAffiliation(AffiliationId: Long, Rank: Int, NormalizedName: String, DisplayName: String, GridId: String, OfficialPage: String, WikiPage: String, PaperCount: Long, CitationCount: Long, Latitude: Option[Float], Longitude: Option[Float], CreatedDate: java.sql.Timestamp) {} + +case class MagPaperAuthorAffiliation(PaperId: Long, AuthorId: Long, AffiliationId: Option[Long], AuthorSequenceNumber: Int, OriginalAuthor: String, OriginalAffiliation: String) {} + + +case class 
MagAuthorAffiliation(author: MagAuthor, affiliation:String) + +case class MagPaperWithAuthorList(PaperId: Long, authors: List[MagAuthorAffiliation]) {} + +case class MagPaperAuthorDenormalized(PaperId: Long, author: MagAuthor, affiliation:String) {} + +case class MagPaperUrl(PaperId: Long, SourceType: Option[Int], SourceUrl: Option[String], LanguageCode: Option[String]) {} + +case class MagUrl(PaperId: Long, instances: List[String]) + + +case class MagJournal(JournalId: Long, Rank: Option[Int], NormalizedName: Option[String], DisplayName: Option[String], Issn: Option[String], Publisher: Option[String], Webpage: Option[String], PaperCount: Option[Long], CitationCount: Option[Long], CreatedDate: Option[java.sql.Timestamp]) {} case object ConversionUtil { + def extractMagIdentifier(pids:mutable.Buffer[String]) :String ={ + val magIDRegex: Regex = "^[0-9]+$".r + val s =pids.filter(p=> magIDRegex.findAllIn(p).hasNext) - - def transformPaperAbstract(input:PaperAbstract) : PaperAbstract = { - PaperAbstract(input.PaperId, convertInvertedIndexString(input.IndexedAbstract)) + if (s.nonEmpty) + return s.head + null } - def convertInvertedIndexString(json_input:String) :String = { + def addInstances(a: (Publication, MagUrl)): Publication = { + val pub = a._1 + val urls = a._2 + + + val i = new Instance + + + if (urls!= null) { + + val l:List[String] = urls.instances.filter(k=>k.nonEmpty):::List(s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}") + + i.setUrl(l.asJava) + } + else + i.setUrl(List(s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}").asJava) + + i.setCollectedfrom(createMAGCollectedFrom()) + pub.setInstance(List(i).asJava) + pub + } + + + def transformPaperAbstract(input: MagPaperAbstract): MagPaperAbstract = { + MagPaperAbstract(input.PaperId, convertInvertedIndexString(input.IndexedAbstract)) + } + + + def createOAFFromJournalAuthorPaper(inputParams: ((MagPapers, MagJournal), 
MagPaperWithAuthorList)): Publication = { + val paper = inputParams._1._1 + val journal = inputParams._1._2 + val authors = inputParams._2 + + val pub = new Publication + pub.setPid(List(createSP(paper.Doi.toLowerCase, "doi", PID_TYPES)).asJava) + pub.setOriginalId(List(paper.PaperId.toString, paper.Doi.toLowerCase).asJava) + + //Set identifier as {50|60} | doiboost____::md5(DOI) + pub.setId(generateIdentifier(pub, paper.Doi.toLowerCase)) + + val mainTitles = createSP(paper.PaperTitle, "main title", "dnet:dataCite_title") + val originalTitles = createSP(paper.OriginalTitle, "alternative title", "dnet:dataCite_title") + pub.setTitle(List(mainTitles, originalTitles).asJava) + + pub.setSource(List(asField(paper.BookTitle)).asJava) + + val authorsOAF = authors.authors.map { f: MagAuthorAffiliation => + + val a: eu.dnetlib.dhp.schema.oaf.Author = new eu.dnetlib.dhp.schema.oaf.Author + + a.setFullname(f.author.DisplayName.get) + + if(f.affiliation!= null) + a.setAffiliation(List(asField(f.affiliation)).asJava) + a.setPid(List(createSP(s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}", "URL", PID_TYPES)).asJava) + a + } + pub.setAuthor(authorsOAF.asJava) + + + if (paper.Date != null && paper.Date.isDefined) { + pub.setDateofacceptance(asField(paper.Date.get.toString)) + } + pub.setPublisher(asField(paper.Publisher)) + + + if (journal != null && journal.DisplayName.isDefined) { + val j = new Journal + + j.setName(journal.DisplayName.get) + j.setSp(paper.FirstPage) + j.setEp(paper.LastPage) + if (journal.Publisher.isDefined) + j.setEdition(journal.Publisher.get) + if (journal.Issn.isDefined) + j.setIssnPrinted(journal.Issn.get) + pub.setJournal(j) + } + pub + } + + + def createOAF(inputParams: ((MagPapers, MagPaperWithAuthorList), MagPaperAbstract)): Publication = { + + val paper = inputParams._1._1 + val authors = inputParams._1._2 + val description = inputParams._2 + + val pub = new Publication + pub.setPid(List(createSP(paper.Doi.toLowerCase, "doi", 
PID_TYPES)).asJava) + pub.setOriginalId(List(paper.PaperId.toString, paper.Doi.toLowerCase).asJava) + + //Set identifier as {50|60} | doiboost____::md5(DOI) + pub.setId(generateIdentifier(pub, paper.Doi.toLowerCase)) + + val mainTitles = createSP(paper.PaperTitle, "main title", "dnet:dataCite_title") + val originalTitles = createSP(paper.OriginalTitle, "alternative title", "dnet:dataCite_title") + pub.setTitle(List(mainTitles, originalTitles).asJava) + + pub.setSource(List(asField(paper.BookTitle)).asJava) + + + if (description != null) { + pub.setDescription(List(asField(description.IndexedAbstract)).asJava) + } + + + val authorsOAF = authors.authors.map { f: MagAuthorAffiliation => + + val a: eu.dnetlib.dhp.schema.oaf.Author = new eu.dnetlib.dhp.schema.oaf.Author + + a.setFullname(f.author.DisplayName.get) + + if(f.affiliation!= null) + a.setAffiliation(List(asField(f.affiliation)).asJava) + + + a.setPid(List(createSP(s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}", "URL", PID_TYPES)).asJava) + + a + + } + + + if (paper.Date != null) { + pub.setDateofacceptance(asField(paper.Date.toString)) + } + + pub.setAuthor(authorsOAF.asJava) + + + pub + + } + + + def convertInvertedIndexString(json_input: String): String = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(json_input) - - - val idl = (json \ "IndexLength").extract[Int] - if (idl > 0) { val res = Array.ofDim[String](idl) val iid = (json \ "InvertedIndex").extract[Map[String, List[Int]]] - for {(k:String,v:List[Int]) <- iid}{ + for {(k: String, v: List[Int]) <- iid} { v.foreach(item => res(item) = k) } + (0 until idl).foreach(i => { + if (res(i) == null) + res(i) = "" + }) return res.mkString(" ") - } "" } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkPreProcessMAG.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkPreProcessMAG.scala index 4c014a95cf..a0e20be1aa 
100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkPreProcessMAG.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkPreProcessMAG.scala @@ -1,13 +1,17 @@ package eu.dnetlib.doiboost.mag import eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.dhp.schema.oaf.Publication +import eu.dnetlib.doiboost.DoiBoostMappingUtil.asField import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{Dataset, SaveMode, SparkSession} +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.slf4j.{Logger, LoggerFactory} import org.apache.spark.sql.functions._ +import scala.collection.JavaConverters._ + object SparkPreProcessMAG { @@ -23,15 +27,21 @@ object SparkPreProcessMAG { .config(conf) .appName(getClass.getSimpleName) .master(parser.get("master")).getOrCreate() + + val sourcePath = parser.get("sourcePath") import spark.implicits._ + implicit val mapEncoderPubs: Encoder[Publication] = org.apache.spark.sql.Encoders.kryo[Publication] + implicit val tupleForJoinEncoder = Encoders.tuple(Encoders.STRING, mapEncoderPubs) + + logger.info("Phase 1) make uninque DOI in Papers:") - val d: Dataset[Papers] = spark.read.load(s"${parser.get("sourcePath")}/Papers").as[Papers] + val d: Dataset[MagPapers] = spark.read.load(s"${parser.get("sourcePath")}/Papers").as[MagPapers] // Filtering Papers with DOI, and since for the same DOI we have multiple version of item with different PapersId we get the last one - val result: RDD[Papers] = d.where(col("Doi").isNotNull).rdd.map { p: Papers => Tuple2(p.Doi, p) }.reduceByKey { case (p1: Papers, p2: Papers) => + val result: RDD[MagPapers] = d.where(col("Doi").isNotNull).rdd.map { p: MagPapers => Tuple2(p.Doi, p) }.reduceByKey { case (p1: MagPapers, p2: MagPapers) => var r = if (p1 == null) p2 else p1 if (p1 != null && p2 != null) { if (p1.CreatedDate != null && 
p2.CreatedDate != null) { @@ -46,16 +56,83 @@ object SparkPreProcessMAG { r }.map(_._2) - val distinctPaper: Dataset[Papers] = spark.createDataset(result) + val distinctPaper: Dataset[MagPapers] = spark.createDataset(result) distinctPaper.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/Papers_distinct") logger.info(s"Total number of element: ${result.count()}") - logger.info("Phase 2) convert InverdIndex Abastrac to string") - val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[PaperAbstract] + logger.info("Phase 3) Group Author by PaperId") + val authors = spark.read.load(s"$sourcePath/Authors").as[MagAuthor] + + val affiliation =spark.read.load(s"$sourcePath/Affiliations").as[MagAffiliation] + + val paperAuthorAffiliation =spark.read.load(s"$sourcePath/PaperAuthorAffiliations").as[MagPaperAuthorAffiliation] + + + paperAuthorAffiliation.joinWith(authors, paperAuthorAffiliation("AuthorId").equalTo(authors("AuthorId"))) + .map{case (a:MagPaperAuthorAffiliation,b:MagAuthor )=> (a.AffiliationId,MagPaperAuthorDenormalized(a.PaperId, b, null)) } + .joinWith(affiliation, affiliation("AffiliationId").equalTo(col("_1")), "left") + .map(s => { + val mpa = s._1._2 + val af = s._2 + if (af!= null) { + MagPaperAuthorDenormalized(mpa.PaperId, mpa.author, af.DisplayName) + } else + mpa + }).groupBy("PaperId").agg(collect_list(struct($"author", $"affiliation")).as("authors")) + .write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_1_paper_authors") + + + + logger.info("Phase 4) create First Version of publication Entity with Paper Journal and Authors") + + + val journals = spark.read.load(s"$sourcePath/Journals").as[MagJournal] + + val papers =spark.read.load((s"${parser.get("targetPath")}/Papers_distinct")).as[MagPapers] + + val paperWithAuthors = spark.read.load(s"${parser.get("targetPath")}/merge_step_1_paper_authors").as[MagPaperWithAuthorList] + + + + val firstJoin =papers.joinWith(journals, 
papers("JournalId").equalTo(journals("JournalId")),"left") + firstJoin.joinWith(paperWithAuthors, firstJoin("_1.PaperId").equalTo(paperWithAuthors("PaperId")), "left") + .map { a: ((MagPapers, MagJournal), MagPaperWithAuthorList) => ConversionUtil.createOAFFromJournalAuthorPaper(a) }.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_2") + + + + var magPubs:Dataset[(String,Publication)] = spark.read.load(s"${parser.get("targetPath")}/merge_step_2").as[Publication].map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String,Publication)] + + val paperUrlDataset = spark.read.load(s"$sourcePath/PaperUrls").as[MagPaperUrl].groupBy("PaperId").agg(collect_list(struct("sourceUrl")).as("instances")).as[MagUrl] + + + logger.info("Phase 5) enrich publication with URL and Instances") + + magPubs.joinWith(paperUrlDataset, col("_1").equalTo(paperUrlDataset("PaperId")), "left") + .map{a:((String,Publication), MagUrl) => ConversionUtil.addInstances((a._1._2, a._2))} + .write.mode(SaveMode.Overwrite) + .save(s"${parser.get("targetPath")}/merge_step_3") + + + + logger.info("Phase 6) Enrich Publication with description") + val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract] pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract") + val paperAbstract =spark.read.load((s"${parser.get("targetPath")}/PaperAbstract")).as[MagPaperAbstract] - distinctPaper.joinWith(pa, col("PaperId").eqia) + + magPubs = spark.read.load(s"${parser.get("targetPath")}/merge_step_3").as[Publication].map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String,Publication)] + + magPubs.joinWith(paperAbstract,col("_1").equalTo(paperAbstract("PaperId")), "left").map(p=> + { + val pub = p._1._2 + val abst = p._2 + if (abst!= null) { + pub.setDescription(List(asField(abst.IndexedAbstract)).asJava) + } + pub + } + 
).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_4") } diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/mag/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/mag/oozie_app/workflow.xml index ba6eea3646..2277b79b0f 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/mag/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/mag/oozie_app/workflow.xml @@ -72,6 +72,7 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.sql.shuffle.partitions=3840 ${sparkExtraOPT} --sourcePath${sourcePath} diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_gen_authors/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_gen_authors/oozie_app/config-default.xml index 3726022cbf..a720e75923 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_gen_authors/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_gen_authors/oozie_app/config-default.xml @@ -11,6 +11,10 @@ queueName default + + oozie.use.system.libpath + true + oozie.action.sharelib.for.spark spark2 diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_gen_authors/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_gen_authors/oozie_app/workflow.xml index a4d65ed000..f258fae6e8 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_gen_authors/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_gen_authors/oozie_app/workflow.xml @@ -16,6 +16,10 @@ sparkExecutorCores number of cores used by single executor + + 
outputPath + the working dir base path + @@ -47,6 +51,7 @@ -mt yarn --workingPath_orcid${workingPath_activities}/ + -o${outputPath}/ diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/mag/MAGMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/mag/MAGMappingTest.scala index 0aaaeb3777..4d26969dd9 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/mag/MAGMappingTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/mag/MAGMappingTest.scala @@ -1,10 +1,21 @@ package eu.dnetlib.doiboost.mag -import org.codehaus.jackson.map.ObjectMapper +import eu.dnetlib.dhp.schema.oaf.Publication +import org.apache.htrace.fasterxml.jackson.databind.SerializationFeature +import org.apache.spark.SparkConf +import org.apache.spark.api.java.function.MapFunction +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig} import org.junit.jupiter.api.Test import org.slf4j.{Logger, LoggerFactory} import org.junit.jupiter.api.Assertions._ +import org.apache.spark.sql.functions._ + +import scala.collection.JavaConverters._ import scala.io.Source +import scala.reflect.ClassTag +import scala.util.matching.Regex + class MAGMappingTest { @@ -13,14 +24,49 @@ class MAGMappingTest { val mapper = new ObjectMapper() - //@Test + @Test def testMAGCSV(): Unit = { - SparkPreProcessMAG.main("-m local[*] -s /data/doiboost/mag/datasets -t /data/doiboost/mag/datasets/preprocess".split(" ")) + // SparkPreProcessMAG.main("-m local[*] -s /data/doiboost/mag/datasets -t /data/doiboost/mag/datasets/preprocess".split(" ")) + + val sparkConf: SparkConf = new SparkConf + + val spark: SparkSession = SparkSession.builder() + .config(sparkConf) + .appName(getClass.getSimpleName) + .master("local[*]") + .getOrCreate() + + import spark.implicits._ + + + implicit val mapEncoderPubs: Encoder[Publication] = 
org.apache.spark.sql.Encoders.kryo[Publication] + implicit val longBarEncoder = Encoders.tuple(Encoders.STRING, mapEncoderPubs) + + val sourcePath = "/data/doiboost/mag/input" + + mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) + + + val magOAF = spark.read.load("$sourcePath/merge_step_4").as[Publication] + + println(magOAF.first().getOriginalId) + + + magOAF.map(k => (ConversionUtil.extractMagIdentifier(k.getOriginalId.asScala),k)).as[(String,Publication)].show() + + + println((ConversionUtil.extractMagIdentifier(magOAF.first().getOriginalId.asScala))) + + val magIDRegex: Regex = "^[0-9]+$".r + + + println(magIDRegex.findFirstMatchIn("suca").isDefined) + } @Test - def buildInvertedIndexTest() :Unit = { + def buildInvertedIndexTest(): Unit = { val json_input = Source.fromInputStream(getClass.getResourceAsStream("invertedIndex.json")).mkString val description = ConversionUtil.convertInvertedIndexString(json_input) assertNotNull(description) @@ -32,3 +78,5 @@ class MAGMappingTest { } + +