From d0ac7514b225d08a89159fa59e2ad3a79bacaf42 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 18 Jun 2020 19:37:25 +0200 Subject: [PATCH 01/37] cleaning workflow to include cleaning of default values --- .../oa/graph/clean/CleanGraphSparkJob.java | 77 +++++++++++++++++++ .../dhp/oa/graph/clean/CleaningRuleMap.java | 34 ++++---- .../oa/graph/raw/common/VocabularyGroup.java | 6 +- .../oa/graph/clean/CleaningFunctionTest.java | 7 ++ .../eu/dnetlib/dhp/oa/graph/clean/result.json | 6 ++ 5 files changed, 114 insertions(+), 16 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java index b2c7152d5..c90898814 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java @@ -3,9 +3,13 @@ package eu.dnetlib.dhp.oa.graph.clean; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import java.io.BufferedInputStream; +import java.util.Objects; import java.util.Optional; +import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; @@ -19,7 +23,9 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; @@ -84,12 +90,83 @@ public class 
CleanGraphSparkJob { readTableFromPath(spark, inputPath, clazz) .map((MapFunction) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz)) + .map((MapFunction) value -> fixDefaults(value), Encoders.bean(clazz)) .write() .mode(SaveMode.Overwrite) .option("compression", "gzip") .json(outputPath); } + private static T fixDefaults(T value) { + if (value instanceof Datasource) { + // nothing to clean here + } else if (value instanceof Project) { + // nothing to clean here + } else if (value instanceof Organization) { + Organization o = (Organization) value; + if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) { + o.setCountry(qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_COUNTRY_TYPE)); + } + } else if (value instanceof Relation) { + // nothing to clean here + } else if (value instanceof Result) { + + Result r = (Result) value; + if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) { + r + .setLanguage( + qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES)); + } + if (Objects.nonNull(r.getSubject())) { + r + .setSubject( + r + .getSubject() + .stream() + .filter(Objects::nonNull) + .filter(sp -> StringUtils.isNotBlank(sp.getValue())) + .filter(sp -> Objects.nonNull(sp.getQualifier())) + .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) + .collect(Collectors.toList())); + } + if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) { + r + .setResourcetype( + qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE)); + } + if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) { + r + .setBestaccessright( + qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES)); + } + if (Objects.nonNull(r.getInstance())) { + for (Instance i : r.getInstance()) { + if (Objects.isNull(i.getAccessright()) || 
StringUtils.isBlank(i.getAccessright().getClassid())) { + i.setAccessright(qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES)); + } + } + } + + if (value instanceof Publication) { + + } else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) { + + } else if (value instanceof OtherResearchProduct) { + + } else if (value instanceof Software) { + + } + } + + return value; + } + + private static Qualifier qualifier(String classid, String classname, String scheme) { + return OafMapperUtils + .qualifier( + classid, classname, scheme, scheme); + } + private static Dataset readTableFromPath( SparkSession spark, String inputEntityPath, Class clazz) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java index 8006f7300..d2d4e118f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java @@ -4,10 +4,13 @@ package eu.dnetlib.dhp.oa.graph.clean; import java.io.Serializable; import java.util.HashMap; +import org.apache.commons.lang3.StringUtils; + import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.Country; import eu.dnetlib.dhp.schema.oaf.Qualifier; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; public class CleaningRuleMap extends HashMap> implements Serializable { @@ -18,23 +21,24 @@ public class CleaningRuleMap extends HashMap */ public static CleaningRuleMap create(VocabularyGroup vocabularies) { CleaningRuleMap mapping = new CleaningRuleMap(); - mapping.put(Qualifier.class, o -> { - Qualifier q = (Qualifier) o; - if (vocabularies.vocabularyExists(q.getSchemeid())) { - 
Qualifier newValue = vocabularies.lookup(q.getSchemeid(), q.getClassid()); - q.setClassid(newValue.getClassid()); - q.setClassname(newValue.getClassname()); + mapping.put(Qualifier.class, o -> cleanQualifier(vocabularies, (Qualifier) o)); + mapping.put(Country.class, o -> { + final Country c = (Country) o; + if (StringUtils.isBlank(c.getSchemeid())) { + c.setSchemeid(ModelConstants.DNET_COUNTRY_TYPE); + c.setSchemename(ModelConstants.DNET_COUNTRY_TYPE); } - }); - mapping.put(StructuredProperty.class, o -> { - StructuredProperty sp = (StructuredProperty) o; - // TODO implement a policy - /* - * if (StringUtils.isBlank(sp.getValue())) { sp.setValue(null); sp.setQualifier(null); sp.setDataInfo(null); - * } - */ + cleanQualifier(vocabularies, c); }); return mapping; } + private static void cleanQualifier(VocabularyGroup vocabularies, Q q) { + if (vocabularies.vocabularyExists(q.getSchemeid())) { + Qualifier newValue = vocabularies.lookup(q.getSchemeid(), q.getClassid()); + q.setClassid(newValue.getClassid()); + q.setClassname(newValue.getClassname()); + } + } + } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java index d9ff62596..334339d3b 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java @@ -122,7 +122,11 @@ public class VocabularyGroup implements Serializable { } public boolean vocabularyExists(final String vocId) { - return vocs.containsKey(vocId.toLowerCase()); + return Optional + .ofNullable(vocId) + .map(String::toLowerCase) + .map(id -> vocs.containsKey(id)) + .orElse(false); } private void addSynonyms(final String vocId, final String termId, final String syn) { diff --git 
a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java index 1b21ce2d3..4783aa81f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java @@ -21,6 +21,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; @@ -56,6 +57,9 @@ public class CleaningFunctionTest { String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result.json")); Publication p_in = MAPPER.readValue(json, Publication.class); + assertTrue(p_in instanceof Result); + assertTrue(p_in instanceof Publication); + Publication p_out = OafCleaner.apply(p_in, mapping); assertNotNull(p_out); @@ -63,6 +67,9 @@ public class CleaningFunctionTest { assertEquals("und", p_out.getLanguage().getClassid()); assertEquals("Undetermined", p_out.getLanguage().getClassname()); + assertEquals("DE", p_out.getCountry().get(0).getClassid()); + assertEquals("Germany", p_out.getCountry().get(0).getClassname()); + assertEquals("0018", p_out.getInstance().get(0).getInstancetype().getClassid()); assertEquals("Annotation", p_out.getInstance().get(0).getInstancetype().getClassname()); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json index b63a12f61..2c1d5017d 100644 --- 
a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json @@ -202,6 +202,12 @@ "contributor": [ ], "country": [ + { + "classid": "DE", + "classname": "DE", + "schemeid": "dnet:countries", + "schemename": "dnet:countries" + } ], "coverage": [ ], From 834f139e6e89490972343e0436c291c237520735 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Fri, 19 Jun 2020 12:33:29 +0200 Subject: [PATCH 02/37] fixed some NPE --- dhp-workflows/dhp-broker-events/pom.xml | 2 +- .../broker/oa/GenerateEventsApplication.java | 23 +- .../dhp/broker/oa/util/ConversionUtils.java | 284 +++++++++++------- .../dhp/broker/oa/util/UpdateInfo.java | 14 +- 4 files changed, 199 insertions(+), 124 deletions(-) diff --git a/dhp-workflows/dhp-broker-events/pom.xml b/dhp-workflows/dhp-broker-events/pom.xml index cd3257991..f943ac93a 100644 --- a/dhp-workflows/dhp-broker-events/pom.xml +++ b/dhp-workflows/dhp-broker-events/pom.xml @@ -53,7 +53,7 @@ eu.dnetlib dnet-openaire-broker-common - [3.0.1,4.0.0) + [3.0.2,4.0.0) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java index 3357710f0..ae313813d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java @@ -30,11 +30,9 @@ import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.OpenaireBrokerResultAggregator; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedEntityFactory; import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.schema.oaf.Publication; import 
eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Result; -import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.pace.config.DedupConfig; @@ -85,7 +83,9 @@ public class GenerateEventsApplication { removeOutputDir(spark, eventsPath); // TODO REMOVE THIS - expandResultsWithRelations(spark, graphPath, Publication.class) + readPath(spark, graphPath + "/publication", Publication.class) + .filter(r -> r.getDataInfo().getDeletedbyinference()) + .map(ConversionUtils::oafResultToBrokerResult, Encoders.bean(OpenaireBrokerResult.class)) .write() .mode(SaveMode.Overwrite) .json(eventsPath); @@ -141,15 +141,15 @@ public class GenerateEventsApplication { final String graphPath, final Class sourceClass) { - final Dataset projects = readPath(spark, graphPath + "/project", Project.class); - final Dataset datasets = readPath( - spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class); - final Dataset softwares = readPath(spark, graphPath + "/software", Software.class); - final Dataset publications = readPath(spark, graphPath + "/publication", Publication.class); + // final Dataset projects = readPath(spark, graphPath + "/project", Project.class); + // final Dataset datasets = readPath( + // spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class); + // final Dataset softwares = readPath(spark, graphPath + "/software", Software.class); + // final Dataset publications = readPath(spark, graphPath + "/publication", Publication.class); - final Dataset rels = readPath(spark, graphPath + "/relation", Relation.class) - .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)) - .cache(); + // final Dataset rels = readPath(spark, graphPath + "/relation", Relation.class) + // .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)) + // .cache(); final Dataset r0 = readPath( spark, graphPath + "/" 
+ sourceClass.getSimpleName().toLowerCase(), sourceClass) @@ -185,7 +185,6 @@ public class GenerateEventsApplication { final TypedColumn, OpenaireBrokerResult> aggr = new OpenaireBrokerResultAggregator() .toColumn(); - ; return sources .joinWith(typedRels, sources.col("openaireId").equalTo(rels.col("source")), "left_outer") diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java index d04ef45a0..d8f9dffbe 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java @@ -3,6 +3,7 @@ package eu.dnetlib.dhp.broker.oa.util; import java.util.ArrayList; import java.util.List; +import java.util.Objects; import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; @@ -13,6 +14,8 @@ import org.dom4j.DocumentHelper; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.common.base.Function; + import eu.dnetlib.broker.objects.OpenaireBrokerResult; import eu.dnetlib.broker.objects.TypedValue; import eu.dnetlib.dhp.schema.oaf.Author; @@ -24,6 +27,7 @@ import eu.dnetlib.dhp.schema.oaf.Journal; import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; @@ -33,133 +37,186 @@ public class ConversionUtils { private static final Logger log = LoggerFactory.getLogger(ConversionUtils.class); public static List oafInstanceToBrokerInstances(final Instance i) { - return i.getUrl().stream().map(url -> { - return new eu.dnetlib.broker.objects.Instance() - .setUrl(url) - 
.setInstancetype(i.getInstancetype().getClassid()) - .setLicense(BrokerConstants.OPEN_ACCESS) - .setHostedby(i.getHostedby().getValue()); - }).collect(Collectors.toList()); + if (i == null) { + return new ArrayList<>(); + } + + return mappedList(i.getUrl(), url -> { + final eu.dnetlib.broker.objects.Instance res = new eu.dnetlib.broker.objects.Instance(); + res.setUrl(url); + res.setInstancetype(classId(i.getInstancetype())); + res.setLicense(BrokerConstants.OPEN_ACCESS); + res.setHostedby(kvValue(i.getHostedby())); + return res; + }); } public static TypedValue oafPidToBrokerPid(final StructuredProperty sp) { - return sp != null ? new TypedValue() - .setValue(sp.getValue()) - .setType(sp.getQualifier().getClassid()) : null; + return oafStructPropToBrokerTypedValue(sp); + } + + public static TypedValue oafStructPropToBrokerTypedValue(final StructuredProperty sp) { + return sp != null ? new TypedValue(classId(sp.getQualifier()), sp.getValue()) : null; } public static final Pair oafSubjectToPair(final StructuredProperty sp) { - return sp != null ? Pair.of(sp.getQualifier().getClassid(), sp.getValue()) : null; + return sp != null ? Pair.of(classId(sp.getQualifier()), sp.getValue()) : null; } public static final eu.dnetlib.broker.objects.Dataset oafDatasetToBrokerDataset(final Dataset d) { - return d != null ? 
new eu.dnetlib.broker.objects.Dataset() - .setOriginalId(d.getOriginalId().get(0)) - .setTitle(structPropValue(d.getTitle())) - .setPids(d.getPid().stream().map(ConversionUtils::oafPidToBrokerPid).collect(Collectors.toList())) - .setInstances( - d - .getInstance() - .stream() - .map(ConversionUtils::oafInstanceToBrokerInstances) - .flatMap(List::stream) - .collect(Collectors.toList())) - .setCollectedFrom(d.getCollectedfrom().stream().map(KeyValue::getValue).findFirst().orElse(null)) - : null; + if (d == null) { + return null; + } + + final eu.dnetlib.broker.objects.Dataset res = new eu.dnetlib.broker.objects.Dataset(); + res.setOriginalId(first(d.getOriginalId())); + res.setTitle(structPropValue(d.getTitle())); + res.setPids(mappedList(d.getPid(), ConversionUtils::oafPidToBrokerPid)); + res.setInstances(flatMappedList(d.getInstance(), ConversionUtils::oafInstanceToBrokerInstances)); + res.setCollectedFrom(mappedFirst(d.getCollectedfrom(), KeyValue::getValue)); + return res; } public static eu.dnetlib.broker.objects.Publication oafPublicationToBrokerPublication(final Publication p) { - return p != null ? 
new eu.dnetlib.broker.objects.Publication() - .setOriginalId(p.getOriginalId().get(0)) - .setTitle(structPropValue(p.getTitle())) - .setPids(p.getPid().stream().map(ConversionUtils::oafPidToBrokerPid).collect(Collectors.toList())) - .setInstances( - p - .getInstance() - .stream() - .map(ConversionUtils::oafInstanceToBrokerInstances) - .flatMap(List::stream) - .collect(Collectors.toList())) - .setCollectedFrom(p.getCollectedfrom().stream().map(KeyValue::getValue).findFirst().orElse(null)) - : null; + if (p == null) { + return null; + } + + final eu.dnetlib.broker.objects.Publication res = new eu.dnetlib.broker.objects.Publication(); + res.setOriginalId(first(p.getOriginalId())); + res.setTitle(structPropValue(p.getTitle())); + res.setPids(mappedList(p.getPid(), ConversionUtils::oafPidToBrokerPid)); + res.setInstances(flatMappedList(p.getInstance(), ConversionUtils::oafInstanceToBrokerInstances)); + res.setCollectedFrom(mappedFirst(p.getCollectedfrom(), KeyValue::getValue)); + + return res; } public static final OpenaireBrokerResult oafResultToBrokerResult(final Result result) { + if (result == null) { + return null; + } - return result != null ? 
new OpenaireBrokerResult() - .setOpenaireId(result.getId()) - .setOriginalId(result.getOriginalId().get(0)) - .setTypology(result.getResulttype().getClassid()) - .setTitles(structPropList(result.getTitle())) - .setAbstracts(fieldList(result.getDescription())) - .setLanguage(result.getLanguage().getClassid()) - .setSubjects(structPropTypedList(result.getSubject())) - .setCreators( - result.getAuthor().stream().map(ConversionUtils::oafAuthorToBrokerAuthor).collect(Collectors.toList())) - .setPublicationdate(result.getDateofacceptance().getValue()) - .setPublisher(fieldValue(result.getPublisher())) - .setEmbargoenddate(fieldValue(result.getEmbargoenddate())) - .setContributor(fieldList(result.getContributor())) + final OpenaireBrokerResult res = new OpenaireBrokerResult(); + + res.setOpenaireId(result.getId()); + res.setOriginalId(first(result.getOriginalId())); + res.setTypology(classId(result.getResulttype())); + res.setTitles(structPropList(result.getTitle())); + res.setAbstracts(fieldList(result.getDescription())); + res.setLanguage(classId(result.getLanguage())); + res.setSubjects(structPropTypedList(result.getSubject())); + res.setCreators(mappedList(result.getAuthor(), ConversionUtils::oafAuthorToBrokerAuthor)); + res.setPublicationdate(fieldValue(result.getDateofacceptance())); + res.setPublisher(fieldValue(result.getPublisher())); + res.setEmbargoenddate(fieldValue(result.getEmbargoenddate())); + res.setContributor(fieldList(result.getContributor())); + res .setJournal( - result instanceof Publication ? 
oafJournalToBrokerJournal(((Publication) result).getJournal()) : null) - .setCollectedFromId(result.getCollectedfrom().stream().map(KeyValue::getKey).findFirst().orElse(null)) - .setCollectedFromName(result.getCollectedfrom().stream().map(KeyValue::getValue).findFirst().orElse(null)) - .setPids(result.getPid().stream().map(ConversionUtils::oafPidToBrokerPid).collect(Collectors.toList())) - .setInstances( - result - .getInstance() - .stream() - .map(ConversionUtils::oafInstanceToBrokerInstances) - .flatMap(List::stream) - .collect(Collectors.toList())) - .setExternalReferences( - result - .getExternalReference() - .stream() - .map(ConversionUtils::oafExtRefToBrokerExtRef) - .collect(Collectors.toList())) - : null; + result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null); + res.setCollectedFromId(mappedFirst(result.getCollectedfrom(), KeyValue::getKey)); + res.setCollectedFromName(mappedFirst(result.getCollectedfrom(), KeyValue::getValue)); + res.setPids(mappedList(result.getPid(), ConversionUtils::oafPidToBrokerPid)); + res.setInstances(flatMappedList(result.getInstance(), ConversionUtils::oafInstanceToBrokerInstances)); + res.setExternalReferences(mappedList(result.getExternalReference(), ConversionUtils::oafExtRefToBrokerExtRef)); + + return res; } private static List structPropTypedList(final List list) { + if (list == null) { + return new ArrayList<>(); + } + return list .stream() - .map( - p -> new TypedValue() - .setValue(p.getValue()) - .setType(p.getQualifier().getClassid())) + .map(ConversionUtils::oafStructPropToBrokerTypedValue) .collect(Collectors.toList()); } + private static List mappedList(final List list, final Function func) { + if (list == null) { + return new ArrayList<>(); + } + + return list + .stream() + .map(func::apply) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + } + + private static List flatMappedList(final List list, final Function> func) { + if (list == null) { + return 
new ArrayList<>(); + } + + return list + .stream() + .map(func::apply) + .flatMap(List::stream) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + } + + private static T mappedFirst(final List list, final Function func) { + if (list == null) { + return null; + } + + return list + .stream() + .map(func::apply) + .filter(Objects::nonNull) + .findFirst() + .orElse(null); + } + private static eu.dnetlib.broker.objects.Author oafAuthorToBrokerAuthor(final Author author) { - return author != null ? new eu.dnetlib.broker.objects.Author() - .setFullname(author.getFullname()) - .setOrcid( - author - .getPid() - .stream() - .filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase("orcid")) - .map(pid -> pid.getValue()) - .findFirst() - .orElse(null)) - : null; + if (author == null) { + return null; + } + + final String pids = author.getPid() != null ? author + .getPid() + .stream() + .filter(pid -> pid != null) + .filter(pid -> pid.getQualifier() != null) + .filter(pid -> pid.getQualifier().getClassid() != null) + .filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase("orcid")) + .map(pid -> pid.getValue()) + .filter(StringUtils::isNotBlank) + .findFirst() + .orElse(null) : null; + + return new eu.dnetlib.broker.objects.Author(author.getFullname(), pids); } private static eu.dnetlib.broker.objects.Journal oafJournalToBrokerJournal(final Journal journal) { - return journal != null ? 
new eu.dnetlib.broker.objects.Journal() - .setName(journal.getName()) - .setIssn(journal.getIssnPrinted()) - .setEissn(journal.getIssnOnline()) - .setLissn(journal.getIssnLinking()) : null; + if (journal == null) { + return null; + } + + final eu.dnetlib.broker.objects.Journal res = new eu.dnetlib.broker.objects.Journal(); + res.setName(journal.getName()); + res.setIssn(journal.getIssnPrinted()); + res.setEissn(journal.getIssnOnline()); + res.setLissn(journal.getIssnLinking()); + + return res; } private static eu.dnetlib.broker.objects.ExternalReference oafExtRefToBrokerExtRef(final ExternalReference ref) { - return ref != null ? new eu.dnetlib.broker.objects.ExternalReference() - .setRefidentifier(ref.getRefidentifier()) - .setSitename(ref.getSitename()) - .setType(ref.getQualifier().getClassid()) - .setUrl(ref.getUrl()) - : null; + if (ref == null) { + return null; + } + + final eu.dnetlib.broker.objects.ExternalReference res = new eu.dnetlib.broker.objects.ExternalReference(); + res.setRefidentifier(ref.getRefidentifier()); + res.setSitename(ref.getSitename()); + res.setType(classId(ref.getQualifier())); + res.setUrl(ref.getUrl()); + return res; } public static final eu.dnetlib.broker.objects.Project oafProjectToBrokerProject(final Project p) { @@ -167,10 +224,10 @@ public class ConversionUtils { return null; } - final eu.dnetlib.broker.objects.Project res = new eu.dnetlib.broker.objects.Project() - .setTitle(fieldValue(p.getTitle())) - .setAcronym(fieldValue(p.getAcronym())) - .setCode(fieldValue(p.getCode())); + final eu.dnetlib.broker.objects.Project res = new eu.dnetlib.broker.objects.Project(); + res.setTitle(fieldValue(p.getTitle())); + res.setAcronym(fieldValue(p.getAcronym())); + res.setCode(fieldValue(p.getCode())); final String ftree = fieldValue(p.getFundingtree()); if (StringUtils.isNotBlank(ftree)) { @@ -188,12 +245,25 @@ public class ConversionUtils { } public static final eu.dnetlib.broker.objects.Software oafSoftwareToBrokerSoftware(final 
Software sw) { - return sw != null ? new eu.dnetlib.broker.objects.Software() - .setName(structPropValue(sw.getTitle())) - .setDescription(fieldValue(sw.getDescription())) - .setRepository(fieldValue(sw.getCodeRepositoryUrl())) - .setLandingPage(fieldValue(sw.getDocumentationUrl())) - : null; + if (sw == null) { + return null; + } + + final eu.dnetlib.broker.objects.Software res = new eu.dnetlib.broker.objects.Software(); + res.setName(structPropValue(sw.getTitle())); + res.setDescription(fieldValue(sw.getDescription())); + res.setRepository(fieldValue(sw.getCodeRepositoryUrl())); + res.setLandingPage(fieldValue(sw.getDocumentationUrl())); + + return res; + } + + private static String first(final List list) { + return list != null && list.size() > 0 ? list.get(0) : null; + } + + private static String kvValue(final KeyValue kv) { + return kv != null ? kv.getValue() : null; } private static String fieldValue(final Field f) { @@ -205,6 +275,10 @@ public class ConversionUtils { : null; } + private static String classId(final Qualifier q) { + return q != null ? q.getClassid() : null; + } + private static String structPropValue(final List props) { return props != null ? 
props.stream().map(StructuredProperty::getValue).filter(StringUtils::isNotBlank).findFirst().orElse(null) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java index fca9cf89e..2c4bda53d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java @@ -122,13 +122,15 @@ public final class UpdateInfo { .orElse(null); ; - final Provenance provenance = new Provenance().setId(provId).setRepositoryName(provRepo).setUrl(provUrl); + final Provenance provenance = new Provenance(provId, provRepo, provUrl); - return new OpenAireEventPayload() - .setPublication(target) - .setHighlight(hl) - .setTrust(trust) - .setProvenance(provenance); + final OpenAireEventPayload res = new OpenAireEventPayload(); + res.setResult(target); + res.setHighlight(hl); + res.setTrust(trust); + res.setProvenance(provenance); + + return res; } } From 4822747313cf4fb6d771e8e3c11d561d6dc0643d Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Fri, 19 Jun 2020 13:53:56 +0200 Subject: [PATCH 03/37] some fixes --- .../broker/oa/GenerateEventsApplication.java | 24 ++-- .../dhp/broker/oa/util/ConversionUtils.java | 105 +++++++++--------- .../aggregators/simple/ResultAggregator.java | 6 +- .../util/aggregators/simple/ResultGroup.java | 15 +-- .../aggregators/withRels/RelatedDataset.java | 21 +++- .../aggregators/withRels/RelatedProject.java | 21 +++- .../withRels/RelatedPublication.java | 21 +++- .../aggregators/withRels/RelatedSoftware.java | 21 +++- 8 files changed, 143 insertions(+), 91 deletions(-) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java index 
ae313813d..62171ac61 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java @@ -29,8 +29,9 @@ import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultAggregator; import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.OpenaireBrokerResultAggregator; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedEntityFactory; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject; import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.utils.ISLookupClientFactory; @@ -83,9 +84,11 @@ public class GenerateEventsApplication { removeOutputDir(spark, eventsPath); // TODO REMOVE THIS - readPath(spark, graphPath + "/publication", Publication.class) - .filter(r -> r.getDataInfo().getDeletedbyinference()) - .map(ConversionUtils::oafResultToBrokerResult, Encoders.bean(OpenaireBrokerResult.class)) + final Dataset projects = readPath(spark, graphPath + "/project", Project.class); + final Dataset rels = readPath(spark, graphPath + "/relation", Relation.class) + .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)) + .cache(); + relatedEntities(projects, rels, RelatedProject.class) .write() .mode(SaveMode.Overwrite) .json(eventsPath); @@ -129,7 +132,7 @@ public class GenerateEventsApplication { (MapFunction, String>) t -> t._2.getTarget(), Encoders.STRING()) .agg(aggr) .map((MapFunction, ResultGroup>) t -> t._2, Encoders.bean(ResultGroup.class)) - .filter(ResultGroup::isValid) + .filter(rg -> rg.getData().size() > 1) .map( (MapFunction) g -> EventFinder.generateEvents(g, dedupConfig), Encoders.bean(EventGroup.class)) @@ 
-141,15 +144,15 @@ public class GenerateEventsApplication { final String graphPath, final Class sourceClass) { - // final Dataset projects = readPath(spark, graphPath + "/project", Project.class); + final Dataset projects = readPath(spark, graphPath + "/project", Project.class); // final Dataset datasets = readPath( // spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class); // final Dataset softwares = readPath(spark, graphPath + "/software", Software.class); // final Dataset publications = readPath(spark, graphPath + "/publication", Publication.class); - // final Dataset rels = readPath(spark, graphPath + "/relation", Relation.class) - // .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)) - // .cache(); + final Dataset rels = readPath(spark, graphPath + "/relation", Relation.class) + .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)) + .cache(); final Dataset r0 = readPath( spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass) @@ -157,8 +160,7 @@ public class GenerateEventsApplication { .map(ConversionUtils::oafResultToBrokerResult, Encoders.bean(OpenaireBrokerResult.class)); // TODO UNCOMMENT THIS - // final Dataset r1 = join(r0, rels, relatedEntities(projects, rels, - // RelatedProject.class)); + final Dataset r1 = join(r0, rels, relatedEntities(projects, rels, RelatedProject.class)); // final Dataset r2 = join(r1, rels, relatedEntities(softwares, rels, // RelatedSoftware.class)); // final Dataset r3 = join(r2, rels, relatedEntities(datasets, rels, diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java index d8f9dffbe..730d06519 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java +++ 
b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java @@ -7,7 +7,6 @@ import java.util.Objects; import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; -import org.apache.commons.lang3.tuple.Pair; import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.DocumentHelper; @@ -59,10 +58,6 @@ public class ConversionUtils { return sp != null ? new TypedValue(classId(sp.getQualifier()), sp.getValue()) : null; } - public static final Pair oafSubjectToPair(final StructuredProperty sp) { - return sp != null ? Pair.of(classId(sp.getQualifier()), sp.getValue()) : null; - } - public static final eu.dnetlib.broker.objects.Dataset oafDatasetToBrokerDataset(final Dataset d) { if (d == null) { return null; @@ -123,55 +118,6 @@ public class ConversionUtils { return res; } - private static List structPropTypedList(final List list) { - if (list == null) { - return new ArrayList<>(); - } - - return list - .stream() - .map(ConversionUtils::oafStructPropToBrokerTypedValue) - .collect(Collectors.toList()); - } - - private static List mappedList(final List list, final Function func) { - if (list == null) { - return new ArrayList<>(); - } - - return list - .stream() - .map(func::apply) - .filter(Objects::nonNull) - .collect(Collectors.toList()); - } - - private static List flatMappedList(final List list, final Function> func) { - if (list == null) { - return new ArrayList<>(); - } - - return list - .stream() - .map(func::apply) - .flatMap(List::stream) - .filter(Objects::nonNull) - .collect(Collectors.toList()); - } - - private static T mappedFirst(final List list, final Function func) { - if (list == null) { - return null; - } - - return list - .stream() - .map(func::apply) - .filter(Objects::nonNull) - .findFirst() - .orElse(null); - } - private static eu.dnetlib.broker.objects.Author oafAuthorToBrokerAuthor(final Author author) { if (author == null) { return null; @@ -300,4 +246,55 @@ public 
class ConversionUtils { .collect(Collectors.toList()) : new ArrayList<>(); } + + private static List structPropTypedList(final List list) { + if (list == null) { + return new ArrayList<>(); + } + + return list + .stream() + .map(ConversionUtils::oafStructPropToBrokerTypedValue) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + } + + private static List mappedList(final List list, final Function func) { + if (list == null) { + return new ArrayList<>(); + } + + return list + .stream() + .map(func::apply) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + } + + private static List flatMappedList(final List list, final Function> func) { + if (list == null) { + return new ArrayList<>(); + } + + return list + .stream() + .map(func::apply) + .flatMap(List::stream) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + } + + private static T mappedFirst(final List list, final Function func) { + if (list == null) { + return null; + } + + return list + .stream() + .map(func::apply) + .filter(Objects::nonNull) + .findFirst() + .orElse(null); + } + } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultAggregator.java index 747482198..a46fde445 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultAggregator.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultAggregator.java @@ -23,12 +23,14 @@ public class ResultAggregator extends Aggregator t) { - return group.addElement(t._1); + group.getData().add(t._1); + return group; } @Override public ResultGroup merge(final ResultGroup g1, final ResultGroup g2) { - return g1.addGroup(g2); + g1.getData().addAll(g2.getData()); + return g1; } @Override diff --git 
a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultGroup.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultGroup.java index 4308224a5..3f9dbe8af 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultGroup.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultGroup.java @@ -14,23 +14,14 @@ public class ResultGroup implements Serializable { */ private static final long serialVersionUID = -3360828477088669296L; - private final List data = new ArrayList<>(); + private List data = new ArrayList<>(); public List getData() { return data; } - public ResultGroup addElement(final OpenaireBrokerResult elem) { - data.add(elem); - return this; + public void setData(final List data) { + this.data = data; } - public ResultGroup addGroup(final ResultGroup group) { - data.addAll(group.getData()); - return this; - } - - public boolean isValid() { - return data.size() > 1; - } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDataset.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDataset.java index fcf1b89b1..6a5fb258c 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDataset.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDataset.java @@ -11,9 +11,12 @@ public class RelatedDataset implements Serializable { * */ private static final long serialVersionUID = 774487705184038324L; - private final String source; - private final String relType; - private final Dataset relDataset; + private String source; + private String relType; + private Dataset relDataset; + + public RelatedDataset() { + } public 
RelatedDataset(final String source, final String relType, final Dataset relDataset) { this.source = source; @@ -25,12 +28,24 @@ public class RelatedDataset implements Serializable { return source; } + public void setSource(final String source) { + this.source = source; + } + public String getRelType() { return relType; } + public void setRelType(final String relType) { + this.relType = relType; + } + public Dataset getRelDataset() { return relDataset; } + public void setRelDataset(final Dataset relDataset) { + this.relDataset = relDataset; + } + } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProject.java index 233041c09..fafec1e19 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProject.java @@ -12,9 +12,12 @@ public class RelatedProject implements Serializable { */ private static final long serialVersionUID = 4941437626549329870L; - private final String source; - private final String relType; - private final Project relProject; + private String source; + private String relType; + private Project relProject; + + public RelatedProject() { + } public RelatedProject(final String source, final String relType, final Project relProject) { this.source = source; @@ -26,12 +29,24 @@ public class RelatedProject implements Serializable { return source; } + public void setSource(final String source) { + this.source = source; + } + public String getRelType() { return relType; } + public void setRelType(final String relType) { + this.relType = relType; + } + public Project getRelProject() { return relProject; } + public void setRelProject(final Project relProject) { + this.relProject = relProject; + } + 
} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublication.java index 80b92462d..8a31ddf7e 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublication.java @@ -12,9 +12,12 @@ public class RelatedPublication implements Serializable { */ private static final long serialVersionUID = 9021609640411395128L; - private final String source; - private final String relType; - private final Publication relPublication; + private String source; + private String relType; + private Publication relPublication; + + public RelatedPublication() { + } public RelatedPublication(final String source, final String relType, final Publication relPublication) { this.source = source; @@ -26,12 +29,24 @@ public class RelatedPublication implements Serializable { return source; } + public void setSource(final String source) { + this.source = source; + } + public String getRelType() { return relType; } + public void setRelType(final String relType) { + this.relType = relType; + } + public Publication getRelPublication() { return relPublication; } + public void setRelPublication(final Publication relPublication) { + this.relPublication = relPublication; + } + } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftware.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftware.java index 13f1f4290..319387469 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftware.java +++ 
b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftware.java @@ -11,9 +11,12 @@ public class RelatedSoftware implements Serializable { * */ private static final long serialVersionUID = 7573383356943300157L; - private final String source; - private final String relType; - private final Software relSoftware; + private String source; + private String relType; + private Software relSoftware; + + public RelatedSoftware() { + } public RelatedSoftware(final String source, final String relType, final Software relSoftware) { this.source = source; @@ -25,12 +28,24 @@ public class RelatedSoftware implements Serializable { return source; } + public void setSource(final String source) { + this.source = source; + } + public String getRelType() { return relType; } + public void setRelType(final String relType) { + this.relType = relType; + } + public Software getRelSoftware() { return relSoftware; } + public void setRelSoftware(final Software relSoftware) { + this.relSoftware = relSoftware; + } + } From 1681de672ddd22f7702b4330af79b34b6200695d Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 19 Jun 2020 15:11:46 +0200 Subject: [PATCH 04/37] updated mapping scholexplorer to OAF --- .../java/eu/dnetlib/dhp/export/DLIToOAF.scala | 137 +++++++++++--- .../SparkExportContentForOpenAire.scala | 174 ++++++++++++------ 2 files changed, 229 insertions(+), 82 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala index 5d7c444b2..637362acf 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala @@ -4,9 +4,12 @@ import java.time.LocalDateTime import java.time.format.DateTimeFormatter import 
eu.dnetlib.dhp.common.PacePerson -import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Dataset, Field, Instance, KeyValue, Publication, Qualifier, Relation, StructuredProperty} +import eu.dnetlib.dhp.schema.action.AtomicAction +import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Dataset, ExternalReference, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, StructuredProperty} import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIRelation} +import eu.dnetlib.dhp.utils.DHPUtils import org.apache.commons.lang3.StringUtils +import org.codehaus.jackson.map.ObjectMapper import scala.collection.JavaConverters._ @@ -77,6 +80,76 @@ object DLIToOAF { ) + val rel_inverse: Map[String, String] = Map( + "isRelatedTo" -> "isRelatedTo", + "IsSupplementedBy" -> "isSupplementTo", + "cites" -> "IsCitedBy", + "IsCitedBy" -> "cites", + "reviews" -> "IsReviewedBy" + ) + + + val PidTypeMap: Map[String, String] = Map( + "pbmid" -> "pmid", + "pmcid" -> "pmc", + "pmid" -> "pmid", + "pubmedid" -> "pmid", + "DOI" -> "doi", + "doi" -> "doi" + ) + + + def toActionSet(item: Oaf): (String, String) = { + val mapper = new ObjectMapper() + + item match { + case dataset: Dataset => + val a: AtomicAction[Dataset] = new AtomicAction[Dataset] + a.setClazz(classOf[Dataset]) + a.setPayload(dataset) + (dataset.getClass.getCanonicalName, mapper.writeValueAsString(a)) + case publication: Publication => + val a: AtomicAction[Publication] = new AtomicAction[Publication] + a.setClazz(classOf[Publication]) + a.setPayload(publication) + (publication.getClass.getCanonicalName, mapper.writeValueAsString(a)) + case relation: Relation => + val a: AtomicAction[Relation] = new AtomicAction[Relation] + a.setClazz(classOf[Relation]) + a.setPayload(relation) + (relation.getClass.getCanonicalName, mapper.writeValueAsString(a)) + case _ => + null + } + } + + def convertClinicalTrial(dataset: DLIDataset): (String, String) = { + val currentId = generateId(dataset.getId) + val pids = 
dataset.getPid.asScala.filter(p => "clinicaltrials.gov".equalsIgnoreCase(p.getQualifier.getClassname)).map(p => s"50|r3111dacbab5::${DHPUtils.md5(p.getValue.toLowerCase())}") + if (pids.isEmpty) + null + else + (currentId, pids.head) + } + + + def insertExternalRefs(publication: Publication, externalReferences: List[DLIExternalReference]): Publication = { + + val eRefs = externalReferences.map(e => { + val result = new ExternalReference() + result.setSitename(e.sitename) + result.setLabel(e.label) + result.setUrl(e.url) + result.setRefidentifier(e.pid) + result.setDataInfo(generateDataInfo()) + result.setQualifier(createQualifier(e.classId, "dnet:externalReference_typologies")) + result + }) + publication.setExternalReference(eRefs.asJava) + publication + + } + def filterPid(p: StructuredProperty): Boolean = { if (expectecdPidType.contains(p.getQualifier.getClassname) && p.getQualifier.getClassname.equalsIgnoreCase("url")) if (filteredURL.exists(u => p.getValue.contains(u))) @@ -97,7 +170,6 @@ object DLIToOAF { } def convertDLIDatasetToExternalReference(dataset: DLIDataset): DLIExternalReference = { - val currentId = generateId(dataset.getId) val pids = dataset.getPid.asScala.filter(filterPid) if (pids == null || pids.isEmpty) @@ -109,7 +181,7 @@ object DLIToOAF { pid.getQualifier.getClassname match { case "uniprot" => DLIExternalReference(generateId(dataset.getId), s"https://www.uniprot.org/uniprot/${pid.getValue}", "UniProt", extractTitle(dataset.getTitle), pid.getValue, "accessionNumber") case "ena" => - if(pid.getValue!= null && pid.getValue.nonEmpty && pid.getValue.length>7) + if (pid.getValue != null && pid.getValue.nonEmpty && pid.getValue.length > 7) DLIExternalReference(generateId(dataset.getId), s"https://www.ebi.ac.uk/ena/data/view/${pid.getValue.substring(0, 8)}", "European Nucleotide Archive", extractTitle(dataset.getTitle), pid.getValue, "accessionNumber") else null @@ -126,43 +198,50 @@ object DLIToOAF { } - def convertDLIPublicationToOAF(p: 
DLIPublication): Publication = { - + def convertDLIPublicationToOAF(inputPublication: DLIPublication): Publication = { val result = new Publication - result.setId(generateId(p.getId)) + val cleanedPids = inputPublication.getPid.asScala.filter(p => PidTypeMap.contains(p.getQualifier.getClassid)) + .map(p => { + p.setQualifier(createQualifier(PidTypeMap(p.getQualifier.getClassid), p.getQualifier.getSchemeid)) + p + }) + if (cleanedPids.isEmpty) + return null + result.setId(generateId(inputPublication.getId)) result.setDataInfo(generateDataInfo(invisibile = true)) - if (p.getCollectedfrom == null || p.getCollectedfrom.size() == 0 || (p.getCollectedfrom.size() == 1 && p.getCollectedfrom.get(0) == null)) + if (inputPublication.getCollectedfrom == null || inputPublication.getCollectedfrom.size() == 0 || (inputPublication.getCollectedfrom.size() == 1 && inputPublication.getCollectedfrom.get(0) == null)) return null - - result.setCollectedfrom(p.getCollectedfrom.asScala.map(c => collectedFromMap.getOrElse(c.getKey, null)).asJava) - result.setPid(p.getPid) - result.setDateofcollection(p.getDateofcollection) - result.setOriginalId(p.getPid.asScala.map(p => p.getValue).asJava) + result.setCollectedfrom(inputPublication.getCollectedfrom.asScala.map(c => collectedFromMap.getOrElse(c.getKey, null)).filter(p => p != null).asJava) + if(result.getCollectedfrom.isEmpty) + return null + result.setPid(cleanedPids.asJava) + result.setDateofcollection(inputPublication.getDateofcollection) + result.setOriginalId(inputPublication.getPid.asScala.map(p => p.getValue).asJava) result.setDateoftransformation(LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'"))) - if (p.getAuthor == null || p.getAuthor.isEmpty) + if (inputPublication.getAuthor == null || inputPublication.getAuthor.isEmpty) return null - result.setAuthor(p.getAuthor.asScala.map(convertAuthor).asJava) - result.setResulttype(createQualifier(p.getResulttype.getClassid, p.getResulttype.getClassname, 
"dnet:result_typologies", "dnet:result_typologies")) + result.setAuthor(inputPublication.getAuthor.asScala.map(convertAuthor).asJava) + result.setResulttype(createQualifier(inputPublication.getResulttype.getClassid, inputPublication.getResulttype.getClassname, "dnet:result_typologies", "dnet:result_typologies")) - if (p.getSubject != null) - result.setSubject(p.getSubject.asScala.map(convertSubject).asJava) + if (inputPublication.getSubject != null) + result.setSubject(inputPublication.getSubject.asScala.map(convertSubject).asJava) - if (p.getTitle == null || p.getTitle.isEmpty) + if (inputPublication.getTitle == null || inputPublication.getTitle.isEmpty) return null - result.setTitle(List(patchTitle(p.getTitle.get(0))).asJava) + result.setTitle(List(patchTitle(inputPublication.getTitle.get(0))).asJava) - if (p.getRelevantdate == null || p.getRelevantdate.size() == 0) + if (inputPublication.getRelevantdate == null || inputPublication.getRelevantdate.size() == 0) return null - result.setRelevantdate(p.getRelevantdate.asScala.map(patchRelevantDate).asJava) + result.setRelevantdate(inputPublication.getRelevantdate.asScala.map(patchRelevantDate).asJava) - result.setDescription(p.getDescription) + result.setDescription(inputPublication.getDescription) - result.setDateofacceptance(asField(p.getRelevantdate.get(0).getValue)) - result.setPublisher(p.getPublisher) - result.setSource(p.getSource) + result.setDateofacceptance(asField(inputPublication.getRelevantdate.get(0).getValue)) + result.setPublisher(inputPublication.getPublisher) + result.setSource(inputPublication.getSource) result.setBestaccessright(createQualifier("UNKNOWN", "not available", "dnet:access_modes", "dnet:access_modes")) val dois = result.getPid.asScala.filter(p => "doi".equalsIgnoreCase(p.getQualifier.getClassname)).map(p => p.getValue) @@ -170,7 +249,7 @@ object DLIToOAF { return null - val i: Instance = createInstance(s"https://dx.doi.org/${dois.head}", firstInstanceOrNull(p.getInstance()), 
result.getDateofacceptance) + val i: Instance = createInstance(s"https://dx.doi.org/${dois.head}", firstInstanceOrNull(inputPublication.getInstance()), result.getDateofacceptance) if (i != null) result.setInstance(List(i).asJava) @@ -211,7 +290,9 @@ object DLIToOAF { val result: Dataset = new Dataset result.setId(generateId(d.getId)) result.setDataInfo(generateDataInfo()) - result.setCollectedfrom(d.getCollectedfrom.asScala.map(c => collectedFromMap.getOrElse(c.getKey, null)).asJava) + result.setCollectedfrom(d.getCollectedfrom.asScala.map(c => collectedFromMap.getOrElse(c.getKey, null)).filter(p => p != null).asJava) + if(result.getCollectedfrom.isEmpty) + return null result.setPid(d.getPid) @@ -280,7 +361,7 @@ object DLIToOAF { if (dataset) i.setInstancetype(createQualifier("0021", "Dataset", "dnet:publication_resource", "dnet:publication_resource")) else - i.setInstancetype(createQualifier("0000", "UNKNOWN", "dnet:publication_resource", "dnet:publication_resource")) + i.setInstancetype(createQualifier("0000", "Unknown", "dnet:publication_resource", "dnet:publication_resource")) if (originalInstance != null && originalInstance.getHostedby != null) i.setHostedby(originalInstance.getHostedby) diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/SparkExportContentForOpenAire.scala b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/SparkExportContentForOpenAire.scala index f3aa35549..edf951df4 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/SparkExportContentForOpenAire.scala +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/SparkExportContentForOpenAire.scala @@ -4,10 +4,16 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset} import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIRelation} 
import org.apache.commons.io.IOUtils +import org.apache.hadoop.io.Text +import org.apache.hadoop.io.compress.GzipCodec +import org.apache.hadoop.mapred.SequenceFileOutputFormat import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.expressions.Window import org.apache.spark.{SparkConf, SparkContext} import org.codehaus.jackson.map.ObjectMapper + import scala.collection.mutable.ArrayBuffer @@ -36,57 +42,66 @@ object SparkExportContentForOpenAire { implicit val dliRelEncoder: Encoder[DLIRelation] = Encoders.bean(classOf[DLIRelation]) import spark.implicits._ -// -// val relRDD:RDD[Relation] = sc.textFile(s"$workingPath/relation_j") -// .map(s => new ObjectMapper().readValue(s, classOf[DLIRelation])) -// .filter(p => p.getDataInfo.getDeletedbyinference == false) -// .map(DLIToOAF.convertDLIRelation).filter(p=>p!= null) -// spark.createDataset(relRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationDS") -// -// val datRDD:RDD[OafDataset] = sc.textFile(s"$workingPath/dataset") -// .map(s => new ObjectMapper().readValue(s, classOf[DLIDataset])) -// .filter(p => p.getDataInfo.getDeletedbyinference == false) -// .map(DLIToOAF.convertDLIDatasetTOOAF).filter(p=>p!= null) -// spark.createDataset(datRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/datasetDS") -// -// -// val pubRDD:RDD[Publication] = sc.textFile(s"$workingPath/publication") -// .map(s => new ObjectMapper().readValue(s, classOf[DLIPublication])) -// .filter(p => p.getDataInfo.getDeletedbyinference == false) -// .map(DLIToOAF.convertDLIPublicationToOAF).filter(p=>p!= null) -// spark.createDataset(pubRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/publicationDS") -// -// -// -// val pubs:Dataset[Publication] = spark.read.load(s"$workingPath/publicationDS").as[Publication] -// val dats :Dataset[OafDataset] = 
spark.read.load(s"$workingPath/datasetDS").as[OafDataset] - var relDS :Dataset[Relation] = spark.read.load(s"$workingPath/relationDS").as[Relation] -// -// -// pubs.joinWith(relDS, pubs("id").equalTo(relDS("source"))).map(k => k._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationDS_f1") -// -// relDS= spark.read.load(s"$workingPath/relationDS_f1").as[Relation] -// -// relDS.joinWith(dats, relDS("target").equalTo(dats("id"))).map(k => k._1).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationDS_filtered") -// -// -// val r_source = relDS.select(relDS("source")).distinct() -// val r_target = relDS.select(relDS("source")).distinct() -// -// -// pubs.joinWith(r_source, pubs("id").equalTo(r_source("source")), "inner").map(k => k._1).write.mode(SaveMode.Overwrite).save(s"$workingPath/publicationDS_filtered") -// -// dats.joinWith(r_target, dats("id").equalTo(r_target("target")), "inner").map(k => k._1).write.mode(SaveMode.Overwrite).save(s"$workingPath/datasetDS_filtered") -// -// spark.createDataset(sc.textFile(s"$workingPath/dataset") -// .map(s => new ObjectMapper().readValue(s, classOf[DLIDataset])) -// .map(DLIToOAF.convertDLIDatasetToExternalReference) -// .filter(p => p != null)).as[DLIExternalReference].write.mode(SaveMode.Overwrite).save(s"$workingPath/externalReference") -// + + val relRDD:RDD[Relation] = sc.textFile(s"$workingPath/relation_j") + .map(s => new ObjectMapper().readValue(s, classOf[DLIRelation])) + .filter(p => p.getDataInfo.getDeletedbyinference == false) + .map(DLIToOAF.convertDLIRelation).filter(p=>p!= null) + spark.createDataset(relRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationDS") + + val datRDD:RDD[OafDataset] = sc.textFile(s"$workingPath/dataset") + .map(s => new ObjectMapper().readValue(s, classOf[DLIDataset])) + .filter(p => p.getDataInfo.getDeletedbyinference == false) + .map(DLIToOAF.convertDLIDatasetTOOAF).filter(p=>p!= null) + 
spark.createDataset(datRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/datasetDS") + + + val pubRDD:RDD[Publication] = sc.textFile(s"$workingPath/publication") + .map(s => new ObjectMapper().readValue(s, classOf[DLIPublication])) + .filter(p => p.getDataInfo.getDeletedbyinference == false) + .map(DLIToOAF.convertDLIPublicationToOAF).filter(p=>p!= null) + spark.createDataset(pubRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/publicationDS") + + + + val pubs:Dataset[Publication] = spark.read.load(s"$workingPath/publicationDS").as[Publication] + val dats :Dataset[OafDataset] = spark.read.load(s"$workingPath/datasetDS").as[OafDataset] + val relDS1 :Dataset[Relation] = spark.read.load(s"$workingPath/relationDS").as[Relation] + + + val pub_id = pubs.select("id").distinct() + val dat_id = dats.select("id").distinct() + + + pub_id.joinWith(relDS1, pub_id("id").equalTo(relDS1("source"))).map(k => k._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationDS_f1") + + val relDS2= spark.read.load(s"$workingPath/relationDS_f1").as[Relation] + + relDS2.joinWith(dat_id, relDS2("target").equalTo(dats("id"))).map(k => k._1).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationDS_filtered") + + + val r_source = relDS2.select(relDS2("source")).distinct() + val r_target = relDS2.select(relDS2("target")).distinct() + + + val w2 = Window.partitionBy("id").orderBy("lastupdatetimestamp") + + pubs.joinWith(r_source, pubs("id").equalTo(r_source("source")), "inner").map(k => k._1) + .withColumn("row",row_number.over(w2)).where($"row" === 1).drop("row") + .write.mode(SaveMode.Overwrite).save(s"$workingPath/publicationDS_filtered") + + dats.joinWith(r_target, dats("id").equalTo(r_target("target")), "inner").map(k => k._1) + .withColumn("row",row_number.over(w2)).where($"row" === 1).drop("row") + .write.mode(SaveMode.Overwrite).save(s"$workingPath/datasetAS") + + spark.createDataset(sc.textFile(s"$workingPath/dataset") + .map(s => new ObjectMapper().readValue(s, 
classOf[DLIDataset])) + .map(DLIToOAF.convertDLIDatasetToExternalReference) + .filter(p => p != null)).as[DLIExternalReference].write.mode(SaveMode.Overwrite).save(s"$workingPath/externalReference") val pf = spark.read.load(s"$workingPath/publicationDS_filtered").select("id") - relDS = spark.read.load(s"$workingPath/relationDS").as[Relation] - val relationTo = pf.joinWith(relDS, pf("id").equalTo(relDS("source")),"inner").map(t =>t._2) + val relDS3 = spark.read.load(s"$workingPath/relationDS").as[Relation] + val relationTo = pf.joinWith(relDS3, pf("id").equalTo(relDS3("source")),"inner").map(t =>t._2) val extRef = spark.read.load(s"$workingPath/externalReference").as[DLIExternalReference] @@ -100,19 +115,70 @@ object SparkExportContentForOpenAire { (f._1, dli_ext) })).write.mode(SaveMode.Overwrite).save(s"$workingPath/externalReference_grouped") + val pubf :Dataset[Publication] = spark.read.load(s"$workingPath/publicationDS_filtered").as[Publication] + + val groupedERf:Dataset[(String, List[DLIExternalReference])]= spark.read.load(s"$workingPath/externalReference_grouped").as[(String, List[DLIExternalReference])] + + groupedERf.joinWith(pubf,pubf("id").equalTo(groupedERf("_1"))).map(t => + { + val publication = t._2 + if (t._1 != null) { + val eRefs = t._1._2 + DLIToOAF.insertExternalRefs(publication, eRefs) + + } else + publication + } + ).write.mode(SaveMode.Overwrite).save(s"$workingPath/publicationAS") + spark.createDataset(sc.textFile(s"$workingPath/dataset") + .map(s => new ObjectMapper().readValue(s, classOf[DLIDataset])) + .map(DLIToOAF.convertClinicalTrial) + .filter(p => p != null)) + .write.mode(SaveMode.Overwrite).save(s"$workingPath/clinicalTrials") + + val ct:Dataset[(String,String)] = spark.read.load(s"$workingPath/clinicalTrials").as[(String,String)] + + val relDS= spark.read.load(s"$workingPath/relationDS_f1").as[Relation] + + relDS.joinWith(ct, relDS("target").equalTo(ct("_1")), "inner") + .map(k =>{ + val currentRel = k._1 + 
currentRel.setTarget(k._2._2) + currentRel + }).write.mode(SaveMode.Overwrite).save(s"$workingPath/clinicalTrialsRels") + val clRels:Dataset[Relation] = spark.read.load(s"$workingPath/clinicalTrialsRels").as[Relation] + val rels:Dataset[Relation] = spark.read.load(s"$workingPath/relationDS_filtered").as[Relation] + + rels.union(clRels).flatMap(r => { + val inverseRel = new Relation + inverseRel.setSource(r.getTarget) + inverseRel.setTarget(r.getSource) + inverseRel.setDataInfo(r.getDataInfo) + inverseRel.setCollectedfrom(r.getCollectedfrom) + inverseRel.setRelType(r.getRelType) + inverseRel.setSubRelType(r.getSubRelType) + inverseRel.setRelClass(DLIToOAF.rel_inverse(r.getRelClass)) + List(r, inverseRel) + }).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationAS") + val fRels:Dataset[(String,String)] = spark.read.load(s"$workingPath/relationAS").as[Relation].map(DLIToOAF.toActionSet) + val fpubs:Dataset[(String,String)] = spark.read.load(s"$workingPath/publicationAS").as[Publication].map(DLIToOAF.toActionSet) + val fdats:Dataset[(String,String)] = spark.read.load(s"$workingPath/datasetAS").as[OafDataset].map(DLIToOAF.toActionSet) - - - - - + fRels.union(fpubs).union(fdats).rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingPath/rawset", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec]) } + + + + + + + } From d88fe0ac845602eaa929a7454d36864bd3c88f44 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Fri, 19 Jun 2020 15:24:30 +0200 Subject: [PATCH 05/37] join methods --- .../broker/oa/GenerateEventsApplication.java | 96 +++++++++++++++---- .../withRels/RelatedEntityFactory.java | 34 ------- 2 files changed, 75 insertions(+), 55 deletions(-) delete mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedEntityFactory.java diff --git 
a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java index 62171ac61..f15d918c9 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java @@ -28,12 +28,17 @@ import eu.dnetlib.dhp.broker.oa.util.EventGroup; import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultAggregator; import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.OpenaireBrokerResultAggregator; -import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedEntityFactory; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDataset; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedPublication; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedSoftware; import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Project; +import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.pace.config.DedupConfig; @@ -84,11 +89,8 @@ public class GenerateEventsApplication { removeOutputDir(spark, eventsPath); // TODO REMOVE THIS - final Dataset projects = readPath(spark, graphPath + "/project", Project.class); - final Dataset rels = readPath(spark, graphPath + "/relation", Relation.class) - .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)) - .cache(); - 
relatedEntities(projects, rels, RelatedProject.class) + + relatedProjects(spark, graphPath) .write() .mode(SaveMode.Overwrite) .json(eventsPath); @@ -144,7 +146,6 @@ public class GenerateEventsApplication { final String graphPath, final Class sourceClass) { - final Dataset projects = readPath(spark, graphPath + "/project", Project.class); // final Dataset datasets = readPath( // spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class); // final Dataset softwares = readPath(spark, graphPath + "/software", Software.class); @@ -160,25 +161,78 @@ public class GenerateEventsApplication { .map(ConversionUtils::oafResultToBrokerResult, Encoders.bean(OpenaireBrokerResult.class)); // TODO UNCOMMENT THIS - final Dataset r1 = join(r0, rels, relatedEntities(projects, rels, RelatedProject.class)); - // final Dataset r2 = join(r1, rels, relatedEntities(softwares, rels, - // RelatedSoftware.class)); - // final Dataset r3 = join(r2, rels, relatedEntities(datasets, rels, - // RelatedDataset.class)); - // final Dataset r4 = join(r3, rels, relatedEntities(publications, rels, - // RelatedPublication.class));; + final Dataset r1 = join(r0, rels, relatedProjects(spark, graphPath)); + // final Dataset r2 = join(r1, rels, relatedDataset(spark, graphPath)); + // final Dataset r3 = join(r2, rels, relatedPublications(spark, graphPath)); + // final Dataset r4 = join(r3, rels, relatedSoftwares(spark, graphPath)); - return r0; // TODO it should be r4 + return r1; // TODO it should be r4 } - private static Dataset relatedEntities(final Dataset targets, - final Dataset rels, - final Class clazz) { + private static Dataset relatedProjects(final SparkSession spark, final String graphPath) { + + final Dataset projects = readPath(spark, graphPath + "/project", Project.class); + + final Dataset rels = readPath(spark, graphPath + "/relation", Relation.class) + .filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT)); + return rels - .joinWith(targets, 
targets.col("id").equalTo(rels.col("target")), "inner") + .joinWith(projects, projects.col("id").equalTo(rels.col("target")), "inner") .map( - t -> RelatedEntityFactory.newRelatedEntity(t._1.getSource(), t._1.getRelType(), t._2, clazz), - Encoders.bean(clazz)); + t -> new RelatedProject( + t._1.getSource(), + t._1.getRelType(), + ConversionUtils.oafProjectToBrokerProject(t._2)), + Encoders.bean(RelatedProject.class)); + } + + private static Dataset relatedDataset(final SparkSession spark, final String graphPath) { + + final Dataset datasets = readPath( + spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class); + + final Dataset rels = readPath(spark, graphPath + "/relation", Relation.class); + + return rels + .joinWith(datasets, datasets.col("id").equalTo(rels.col("target")), "inner") + .map( + t -> new RelatedDataset( + t._1.getSource(), + t._1.getRelType(), + ConversionUtils.oafDatasetToBrokerDataset(t._2)), + Encoders.bean(RelatedDataset.class)); + } + + private static Dataset relatedSoftwares(final SparkSession spark, final String graphPath) { + + final Dataset softwares = readPath(spark, graphPath + "/software", Software.class); + + final Dataset rels = readPath(spark, graphPath + "/relation", Relation.class); + + return rels + .joinWith(softwares, softwares.col("id").equalTo(rels.col("target")), "inner") + .map( + t -> new RelatedSoftware( + t._1.getSource(), + t._1.getRelType(), + ConversionUtils.oafSoftwareToBrokerSoftware(t._2)), + Encoders.bean(RelatedSoftware.class)); + } + + private static Dataset relatedPublications(final SparkSession spark, final String graphPath) { + + final Dataset pubs = readPath(spark, graphPath + "/publication", Publication.class); + + final Dataset rels = readPath(spark, graphPath + "/relation", Relation.class); + + return rels + .joinWith(pubs, pubs.col("id").equalTo(rels.col("target")), "inner") + .map( + t -> new RelatedPublication( + t._1.getSource(), + t._1.getRelType(), + 
ConversionUtils.oafPublicationToBrokerPublication(t._2)), + Encoders.bean(RelatedPublication.class)); } private static Dataset join(final Dataset sources, diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedEntityFactory.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedEntityFactory.java deleted file mode 100644 index c60d4f141..000000000 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedEntityFactory.java +++ /dev/null @@ -1,34 +0,0 @@ - -package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; - -import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; -import eu.dnetlib.dhp.schema.oaf.Dataset; -import eu.dnetlib.dhp.schema.oaf.Project; -import eu.dnetlib.dhp.schema.oaf.Publication; -import eu.dnetlib.dhp.schema.oaf.Software; - -public class RelatedEntityFactory { - - @SuppressWarnings("unchecked") - public static RT newRelatedEntity(final String sourceId, - final String relType, - final T target, - final Class clazz) { - - if (clazz == RelatedProject.class) { - return (RT) new RelatedProject(sourceId, relType, - ConversionUtils.oafProjectToBrokerProject((Project) target)); - } else if (clazz == RelatedSoftware.class) { - return (RT) new RelatedSoftware(sourceId, relType, - ConversionUtils.oafSoftwareToBrokerSoftware((Software) target)); - } else if (clazz == RelatedDataset.class) { - return (RT) new RelatedDataset(sourceId, relType, - ConversionUtils.oafDatasetToBrokerDataset((Dataset) target)); - } else if (clazz == RelatedPublication.class) { - return (RT) new RelatedPublication(sourceId, relType, - ConversionUtils.oafPublicationToBrokerPublication((Publication) target)); - } else { - return null; - } - } -} From 16c7a184359a66adc89d2431e2a653653d091ddc Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Mon, 22 Jun 2020 08:51:31 +0200 Subject: [PATCH 06/37] refactoring --- 
dhp-workflows/dhp-broker-events/pom.xml | 2 +- .../dhp/broker/model/EventFactory.java | 6 +- .../broker/oa/GenerateEventsApplication.java | 143 +++--------------- .../broker/oa/GenerateRelatedDatasets.java | 73 +++++++++ .../broker/oa/GenerateRelatedProjects.java | 79 ++++++++++ .../oa/GenerateRelatedPublications.java | 78 ++++++++++ .../broker/oa/GenerateRelatedSoftwares.java | 76 ++++++++++ .../broker/oa/GenerateSimpleEntitities.java | 85 +++++++++++ .../dhp/broker/oa/matchers/UpdateMatcher.java | 16 +- .../AbstractEnrichMissingDataset.java | 13 +- .../relatedProjects/EnrichMissingProject.java | 8 +- .../relatedProjects/EnrichMoreProject.java | 12 +- .../AbstractEnrichMissingPublication.java | 14 +- .../EnrichMissingSoftware.java | 11 +- .../relatedSoftware/EnrichMoreSoftware.java | 14 +- .../simple/EnrichMissingAbstract.java | 4 +- .../simple/EnrichMissingAuthorOrcid.java | 12 +- .../simple/EnrichMissingOpenAccess.java | 14 +- .../oa/matchers/simple/EnrichMissingPid.java | 10 +- .../simple/EnrichMissingPublicationDate.java | 6 +- .../matchers/simple/EnrichMissingSubject.java | 12 +- .../matchers/simple/EnrichMoreOpenAccess.java | 12 +- .../oa/matchers/simple/EnrichMorePid.java | 12 +- .../oa/matchers/simple/EnrichMoreSubject.java | 12 +- .../dhp/broker/oa/util/ClusterUtils.java | 31 ++++ .../dhp/broker/oa/util/ConversionUtils.java | 60 +++++--- .../dhp/broker/oa/util/EventFinder.java | 4 +- .../dhp/broker/oa/util/UpdateInfo.java | 38 ++--- .../aggregators/simple/ResultAggregator.java | 6 +- .../util/aggregators/simple/ResultGroup.java | 8 +- ...java => OaBrokerMainEntityAggregator.java} | 26 ++-- .../aggregators/withRels/RelatedDataset.java | 10 +- .../aggregators/withRels/RelatedProject.java | 10 +- .../withRels/RelatedPublication.java | 11 +- .../aggregators/withRels/RelatedSoftware.java | 10 +- .../dhp/broker/oa/generate_relations.json | 14 ++ .../broker/oa/generate_simple_entities.json | 14 ++ 37 files changed, 669 insertions(+), 297 deletions(-) create mode 
100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedDatasets.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedProjects.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedPublications.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedSoftwares.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateSimpleEntitities.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java rename dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/{OpenaireBrokerResultAggregator.java => OaBrokerMainEntityAggregator.java} (59%) create mode 100644 dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_relations.json create mode 100644 dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_simple_entities.json diff --git a/dhp-workflows/dhp-broker-events/pom.xml b/dhp-workflows/dhp-broker-events/pom.xml index f943ac93a..8d7d3b88c 100644 --- a/dhp-workflows/dhp-broker-events/pom.xml +++ b/dhp-workflows/dhp-broker-events/pom.xml @@ -53,7 +53,7 @@ eu.dnetlib dnet-openaire-broker-common - [3.0.2,4.0.0) + [3.0.3,4.0.0) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java index bf4f62d24..6e38f7448 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java @@ -11,7 +11,7 @@ import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; import 
org.apache.commons.lang3.time.DateUtils; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; public class EventFactory { @@ -49,8 +49,8 @@ public class EventFactory { private static Map createMapFromResult(final UpdateInfo updateInfo) { final Map map = new HashMap<>(); - final OpenaireBrokerResult source = updateInfo.getSource(); - final OpenaireBrokerResult target = updateInfo.getTarget(); + final OaBrokerMainEntity source = updateInfo.getSource(); + final OaBrokerMainEntity target = updateInfo.getTarget(); map.put("target_datasource_id", target.getCollectedFromId()); map.put("target_datasource_name", target.getCollectedFromName()); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java index f15d918c9..db5992010 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java @@ -18,27 +18,20 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.broker.model.Event; import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.EventFinder; import eu.dnetlib.dhp.broker.oa.util.EventGroup; import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultAggregator; import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup; -import 
eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.OpenaireBrokerResultAggregator; -import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDataset; -import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject; -import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedPublication; -import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedSoftware; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.Project; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.OaBrokerMainEntityAggregator; import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Result; -import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.pace.config.DedupConfig; @@ -48,8 +41,6 @@ public class GenerateEventsApplication { private static final Logger log = LoggerFactory.getLogger(GenerateEventsApplication.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils @@ -86,11 +77,11 @@ public class GenerateEventsApplication { runWithSparkSession(conf, isSparkSessionManaged, spark -> { - removeOutputDir(spark, eventsPath); + ClusterUtils.removeDir(spark, eventsPath); // TODO REMOVE THIS - relatedProjects(spark, graphPath) + expandResultsWithRelations(spark, graphPath, Publication.class) .write() .mode(SaveMode.Overwrite) .json(eventsPath); @@ -110,28 +101,25 @@ public class GenerateEventsApplication { } - private static void removeOutputDir(final SparkSession spark, final String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } - private static Dataset generateEvents( final SparkSession spark, final String 
graphPath, final Class sourceClass, final DedupConfig dedupConfig) { - final Dataset results = expandResultsWithRelations(spark, graphPath, sourceClass); + final Dataset results = expandResultsWithRelations(spark, graphPath, sourceClass); - final Dataset mergedRels = readPath(spark, graphPath + "/relation", Relation.class) + final Dataset mergedRels = ClusterUtils + .readPath(spark, graphPath + "/relation", Relation.class) .filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)); - final TypedColumn, ResultGroup> aggr = new ResultAggregator() + final TypedColumn, ResultGroup> aggr = new ResultAggregator() .toColumn(); return results .joinWith(mergedRels, results.col("openaireId").equalTo(mergedRels.col("source")), "inner") .groupByKey( - (MapFunction, String>) t -> t._2.getTarget(), Encoders.STRING()) + (MapFunction, String>) t -> t._2.getTarget(), Encoders.STRING()) .agg(aggr) .map((MapFunction, ResultGroup>) t -> t._2, Encoders.bean(ResultGroup.class)) .filter(rg -> rg.getData().size() > 1) @@ -141,7 +129,7 @@ public class GenerateEventsApplication { .flatMap(group -> group.getData().iterator(), Encoders.bean(Event.class)); } - private static Dataset expandResultsWithRelations( + private static Dataset expandResultsWithRelations( final SparkSession spark, final String graphPath, final Class sourceClass) { @@ -151,116 +139,35 @@ public class GenerateEventsApplication { // final Dataset softwares = readPath(spark, graphPath + "/software", Software.class); // final Dataset publications = readPath(spark, graphPath + "/publication", Publication.class); - final Dataset rels = readPath(spark, graphPath + "/relation", Relation.class) - .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)) - .cache(); - - final Dataset r0 = readPath( - spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass) - .filter(r -> r.getDataInfo().getDeletedbyinference()) - .map(ConversionUtils::oafResultToBrokerResult, 
Encoders.bean(OpenaireBrokerResult.class)); + final Dataset r0 = ClusterUtils + .readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass) + .filter(r -> r.getDataInfo().getDeletedbyinference()) + .map(ConversionUtils::oafResultToBrokerResult, Encoders.bean(OaBrokerMainEntity.class)); // TODO UNCOMMENT THIS - final Dataset r1 = join(r0, rels, relatedProjects(spark, graphPath)); - // final Dataset r2 = join(r1, rels, relatedDataset(spark, graphPath)); - // final Dataset r3 = join(r2, rels, relatedPublications(spark, graphPath)); - // final Dataset r4 = join(r3, rels, relatedSoftwares(spark, graphPath)); + // final Dataset r1 = join(r0, relatedProjects(spark, graphPath)); + // final Dataset r2 = join(r1, relatedDataset(spark, graphPath)); + // final Dataset r3 = join(r2, relatedPublications(spark, graphPath)); + // final Dataset r4 = join(r3, relatedSoftwares(spark, graphPath)); - return r1; // TODO it should be r4 + return r0; // TODO it should be r4 } - private static Dataset relatedProjects(final SparkSession spark, final String graphPath) { - - final Dataset projects = readPath(spark, graphPath + "/project", Project.class); - - final Dataset rels = readPath(spark, graphPath + "/relation", Relation.class) - .filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT)); - - return rels - .joinWith(projects, projects.col("id").equalTo(rels.col("target")), "inner") - .map( - t -> new RelatedProject( - t._1.getSource(), - t._1.getRelType(), - ConversionUtils.oafProjectToBrokerProject(t._2)), - Encoders.bean(RelatedProject.class)); - } - - private static Dataset relatedDataset(final SparkSession spark, final String graphPath) { - - final Dataset datasets = readPath( - spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class); - - final Dataset rels = readPath(spark, graphPath + "/relation", Relation.class); - - return rels - .joinWith(datasets, datasets.col("id").equalTo(rels.col("target")), "inner") - .map( - t -> new 
RelatedDataset( - t._1.getSource(), - t._1.getRelType(), - ConversionUtils.oafDatasetToBrokerDataset(t._2)), - Encoders.bean(RelatedDataset.class)); - } - - private static Dataset relatedSoftwares(final SparkSession spark, final String graphPath) { - - final Dataset softwares = readPath(spark, graphPath + "/software", Software.class); - - final Dataset rels = readPath(spark, graphPath + "/relation", Relation.class); - - return rels - .joinWith(softwares, softwares.col("id").equalTo(rels.col("target")), "inner") - .map( - t -> new RelatedSoftware( - t._1.getSource(), - t._1.getRelType(), - ConversionUtils.oafSoftwareToBrokerSoftware(t._2)), - Encoders.bean(RelatedSoftware.class)); - } - - private static Dataset relatedPublications(final SparkSession spark, final String graphPath) { - - final Dataset pubs = readPath(spark, graphPath + "/publication", Publication.class); - - final Dataset rels = readPath(spark, graphPath + "/relation", Relation.class); - - return rels - .joinWith(pubs, pubs.col("id").equalTo(rels.col("target")), "inner") - .map( - t -> new RelatedPublication( - t._1.getSource(), - t._1.getRelType(), - ConversionUtils.oafPublicationToBrokerPublication(t._2)), - Encoders.bean(RelatedPublication.class)); - } - - private static Dataset join(final Dataset sources, - final Dataset rels, + private static Dataset join(final Dataset sources, final Dataset typedRels) { - final TypedColumn, OpenaireBrokerResult> aggr = new OpenaireBrokerResultAggregator() + final TypedColumn, OaBrokerMainEntity> aggr = new OaBrokerMainEntityAggregator() .toColumn(); return sources - .joinWith(typedRels, sources.col("openaireId").equalTo(rels.col("source")), "left_outer") + .joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer") .groupByKey( - (MapFunction, String>) t -> t._1.getOpenaireId(), Encoders.STRING()) + (MapFunction, String>) t -> t._1.getOpenaireId(), Encoders.STRING()) .agg(aggr) - .map(t -> t._2, 
Encoders.bean(OpenaireBrokerResult.class)); + .map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class)); } - public static Dataset readPath( - final SparkSession spark, - final String inputPath, - final Class clazz) { - return spark - .read() - .textFile(inputPath) - .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); - } - private static DedupConfig loadDedupConfig(final String isLookupUrl, final String profId) throws Exception { final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedDatasets.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedDatasets.java new file mode 100644 index 000000000..4a10fbabf --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedDatasets.java @@ -0,0 +1,73 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDataset; +import eu.dnetlib.dhp.schema.oaf.Relation; + +public class GenerateRelatedDatasets { + + private static final Logger log = LoggerFactory.getLogger(GenerateRelatedDatasets.class); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + GenerateRelatedDatasets.class + 
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_relations.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String graphPath = parser.get("graphPath"); + log.info("graphPath: {}", graphPath); + + final String relsPath = parser.get("relsPath"); + log.info("relsPath: {}", relsPath); + + final SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, relsPath); + + final Dataset datasets = ClusterUtils + .readPath(spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class); + + final Dataset rels = ClusterUtils.readPath(spark, graphPath + "/relation", Relation.class); + + rels + .joinWith(datasets, datasets.col("id").equalTo(rels.col("target")), "inner") + .map( + t -> new RelatedDataset( + t._1.getSource(), + t._1.getRelType(), + ConversionUtils.oafDatasetToBrokerDataset(t._2)), + Encoders.bean(RelatedDataset.class)) + .write() + .mode(SaveMode.Overwrite) + .json(relsPath); + + }); + + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedProjects.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedProjects.java new file mode 100644 index 000000000..59ed388e7 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedProjects.java @@ -0,0 +1,79 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.slf4j.Logger; +import 
org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.Project; +import eu.dnetlib.dhp.schema.oaf.Relation; + +public class GenerateRelatedProjects { + + private static final Logger log = LoggerFactory.getLogger(GenerateRelatedProjects.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + GenerateRelatedProjects.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_relations.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String graphPath = parser.get("graphPath"); + log.info("graphPath: {}", graphPath); + + final String relsPath = parser.get("relsPath"); + log.info("relsPath: {}", relsPath); + + final SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, relsPath); + + final Dataset projects = ClusterUtils.readPath(spark, graphPath + "/project", Project.class); + + final Dataset rels = ClusterUtils + .readPath(spark, graphPath + "/relation", Relation.class) + .filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT)); + + rels + .joinWith(projects, projects.col("id").equalTo(rels.col("target")), "inner") + .map( + t -> new RelatedProject( + t._1.getSource(), + t._1.getRelType(), + 
ConversionUtils.oafProjectToBrokerProject(t._2)), + Encoders.bean(RelatedProject.class)) + .write() + .mode(SaveMode.Overwrite) + .json(relsPath); + }); + + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedPublications.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedPublications.java new file mode 100644 index 000000000..0c20081dc --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedPublications.java @@ -0,0 +1,78 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedPublication; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Relation; + +public class GenerateRelatedPublications { + + private static final Logger log = LoggerFactory.getLogger(GenerateRelatedPublications.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + GenerateRelatedPublications.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_relations.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + 
.ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String graphPath = parser.get("graphPath"); + log.info("graphPath: {}", graphPath); + + final String relsPath = parser.get("relsPath"); + log.info("relsPath: {}", relsPath); + + final SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, relsPath); + + final Dataset pubs = ClusterUtils + .readPath(spark, graphPath + "/publication", Publication.class); + + final Dataset rels = ClusterUtils.readPath(spark, graphPath + "/relation", Relation.class); + + rels + .joinWith(pubs, pubs.col("id").equalTo(rels.col("target")), "inner") + .map( + t -> new RelatedPublication( + t._1.getSource(), + t._1.getRelType(), + ConversionUtils.oafPublicationToBrokerPublication(t._2)), + Encoders.bean(RelatedPublication.class)) + .write() + .mode(SaveMode.Overwrite) + .json(relsPath); + + }); + + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedSoftwares.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedSoftwares.java new file mode 100644 index 000000000..b95788846 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedSoftwares.java @@ -0,0 +1,76 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import 
eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedSoftware; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Software; + +public class GenerateRelatedSoftwares { + + private static final Logger log = LoggerFactory.getLogger(GenerateRelatedSoftwares.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + GenerateRelatedSoftwares.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_relations.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String graphPath = parser.get("graphPath"); + log.info("graphPath: {}", graphPath); + + final String relsPath = parser.get("relsPath"); + log.info("relsPath: {}", relsPath); + + final SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, relsPath); + final Dataset softwares = ClusterUtils.readPath(spark, graphPath + "/software", Software.class); + + final Dataset rels = ClusterUtils.readPath(spark, graphPath + "/relation", Relation.class); + + rels + .joinWith(softwares, softwares.col("id").equalTo(rels.col("target")), "inner") + .map( + t -> new RelatedSoftware( + t._1.getSource(), + t._1.getRelType(), + ConversionUtils.oafSoftwareToBrokerSoftware(t._2)), + Encoders.bean(RelatedSoftware.class)) + .write() + .mode(SaveMode.Overwrite) + .json(relsPath); + + }); + + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateSimpleEntitities.java 
b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateSimpleEntitities.java new file mode 100644 index 000000000..59485d5cf --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateSimpleEntitities.java @@ -0,0 +1,85 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class GenerateSimpleEntitities { + + private static final Logger log = LoggerFactory.getLogger(GenerateSimpleEntitities.class); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + GenerateSimpleEntitities.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_simple_entities.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String graphPath = parser.get("graphPath"); + log.info("graphPath: {}", graphPath); + + final String simpleEntitiesPath = parser.get("simpleEntitiesPath"); + log.info("simpleEntitiesPath: {}", simpleEntitiesPath); + + final SparkConf conf = new SparkConf(); + + 
runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, simpleEntitiesPath); + + expandResultsWithRelations(spark, graphPath, Publication.class) + .write() + .mode(SaveMode.Overwrite) + .json(simpleEntitiesPath); + + // TODO UNCOMMENT THIS + // spark + // .emptyDataset(Encoders.bean(Event.class)) + // .union(generateEvents(spark, graphPath, Publication.class, dedupConfig)) + // .union(generateEvents(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class, dedupConfig)) + // .union(generateEvents(spark, graphPath, Software.class, dedupConfig)) + // .union(generateEvents(spark, graphPath, OtherResearchProduct.class, dedupConfig)) + // .write() + // .mode(SaveMode.Overwrite) + // .option("compression", "gzip") + // .json(eventsPath); + }); + + } + + private static Dataset expandResultsWithRelations( + final SparkSession spark, + final String graphPath, + final Class sourceClass) { + + return ClusterUtils + .readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass) + .filter(r -> r.getDataInfo().getDeletedbyinference()) + .map(ConversionUtils::oafResultToBrokerResult, Encoders.bean(OaBrokerMainEntity.class)); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java index 13aeefb2f..9aa6f5384 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java @@ -12,7 +12,7 @@ import java.util.function.Function; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.model.Topic; import 
eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.pace.config.DedupConfig; @@ -21,11 +21,11 @@ public abstract class UpdateMatcher { private final boolean multipleUpdate; private final Function topicFunction; - private final BiConsumer compileHighlightFunction; + private final BiConsumer compileHighlightFunction; private final Function highlightToStringFunction; public UpdateMatcher(final boolean multipleUpdate, final Function topicFunction, - final BiConsumer compileHighlightFunction, + final BiConsumer compileHighlightFunction, final Function highlightToStringFunction) { this.multipleUpdate = multipleUpdate; this.topicFunction = topicFunction; @@ -33,13 +33,13 @@ public abstract class UpdateMatcher { this.highlightToStringFunction = highlightToStringFunction; } - public Collection> searchUpdatesForRecord(final OpenaireBrokerResult res, - final Collection others, + public Collection> searchUpdatesForRecord(final OaBrokerMainEntity res, + final Collection others, final DedupConfig dedupConfig) { final Map> infoMap = new HashMap<>(); - for (final OpenaireBrokerResult source : others) { + for (final OaBrokerMainEntity source : others) { if (source != res) { for (final T hl : findDifferences(source, res)) { final Topic topic = getTopicFunction().apply(hl); @@ -68,7 +68,7 @@ public abstract class UpdateMatcher { } } - protected abstract List findDifferences(OpenaireBrokerResult source, OpenaireBrokerResult target); + protected abstract List findDifferences(OaBrokerMainEntity source, OaBrokerMainEntity target); protected static boolean isMissing(final List list) { return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0)); @@ -86,7 +86,7 @@ public abstract class UpdateMatcher { return topicFunction; } - public BiConsumer getCompileHighlightFunction() { + public BiConsumer getCompileHighlightFunction() { return compileHighlightFunction; } diff --git 
a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java index 7a58f986b..c197734a3 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java @@ -5,13 +5,12 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import eu.dnetlib.broker.objects.Dataset; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerRelatedDataset; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public abstract class AbstractEnrichMissingDataset - extends UpdateMatcher { +public abstract class AbstractEnrichMissingDataset extends UpdateMatcher { public AbstractEnrichMissingDataset(final Topic topic) { super(true, @@ -23,14 +22,14 @@ public abstract class AbstractEnrichMissingDataset protected abstract boolean filterByType(String relType); @Override - protected final List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected final List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final Set existingDatasets = target .getDatasets() .stream() .filter(rel -> filterByType(rel.getRelType())) - .map(Dataset::getOriginalId) + .map(OaBrokerRelatedDataset::getOriginalId) .collect(Collectors.toSet()); return source diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java 
b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java index fa5fde725..49c546bba 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java @@ -4,12 +4,12 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedProjects; import java.util.ArrayList; import java.util.List; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.Project; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerProject; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public class EnrichMissingProject extends UpdateMatcher { +public class EnrichMissingProject extends UpdateMatcher { public EnrichMissingProject() { super(true, @@ -19,7 +19,7 @@ public class EnrichMissingProject extends UpdateMatcher { } @Override - protected List findDifferences(final OpenaireBrokerResult source, final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, final OaBrokerMainEntity target) { if (target.getProjects().isEmpty()) { return source.getProjects(); } else { diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java index ca63aeb49..6954a3fb5 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java @@ -5,12 +5,12 @@ import java.util.List; import java.util.Set; import 
java.util.stream.Collectors; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.Project; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerProject; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public class EnrichMoreProject extends UpdateMatcher { +public class EnrichMoreProject extends UpdateMatcher { public EnrichMoreProject() { super(true, @@ -19,13 +19,13 @@ public class EnrichMoreProject extends UpdateMatcher { prj -> projectAsString(prj)); } - private static String projectAsString(final Project prj) { + private static String projectAsString(final OaBrokerProject prj) { return prj.getFunder() + "::" + prj.getFundingProgram() + "::" + prj.getCode(); } @Override - protected List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final Set existingProjects = target .getProjects() diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java index 300863949..ad6d8263b 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java @@ -5,12 +5,12 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.Publication; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import 
eu.dnetlib.broker.objects.OaBrokerRelatedPublication; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public abstract class AbstractEnrichMissingPublication extends UpdateMatcher { +public abstract class AbstractEnrichMissingPublication extends UpdateMatcher { public AbstractEnrichMissingPublication(final Topic topic) { super(true, @@ -23,15 +23,15 @@ public abstract class AbstractEnrichMissingPublication extends UpdateMatcher findDifferences( - final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected final List findDifferences( + final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final Set existingPublications = target .getPublications() .stream() .filter(rel -> filterByType(rel.getRelType())) - .map(Publication::getOriginalId) + .map(OaBrokerRelatedPublication::getOriginalId) .collect(Collectors.toSet()); return source diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java index 76ae061e6..452caa503 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java @@ -4,12 +4,13 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware; import java.util.ArrayList; import java.util.List; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; public class EnrichMissingSoftware - extends UpdateMatcher { + extends UpdateMatcher { public EnrichMissingSoftware() { 
super(true, @@ -19,9 +20,9 @@ public class EnrichMissingSoftware } @Override - protected List findDifferences( - final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences( + final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { if (target.getSoftwares().isEmpty()) { return source.getSoftwares(); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java index ebd421b8e..aaffe1249 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java @@ -5,12 +5,12 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.Software; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public class EnrichMoreSoftware extends UpdateMatcher { +public class EnrichMoreSoftware extends UpdateMatcher { public EnrichMoreSoftware() { super(true, @@ -20,14 +20,14 @@ public class EnrichMoreSoftware extends UpdateMatcher { } @Override - protected List findDifferences( - final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences( + final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final Set existingSoftwares = source .getSoftwares() .stream() - .map(Software::getName) + .map(OaBrokerRelatedSoftware::getName) .collect(Collectors.toSet()); return target diff --git 
a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java index b2cbbce2c..73462bae8 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java @@ -5,7 +5,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; @@ -19,7 +19,7 @@ public class EnrichMissingAbstract extends UpdateMatcher { } @Override - protected List findDifferences(final OpenaireBrokerResult source, final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, final OaBrokerMainEntity target) { if (isMissing(target.getAbstracts()) && !isMissing(source.getAbstracts())) { return Arrays.asList(source.getAbstracts().get(0)); } else { diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java index c4b96e67b..2a01188a9 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java @@ -7,12 +7,12 @@ import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; -import eu.dnetlib.broker.objects.Author; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import 
eu.dnetlib.broker.objects.OaBrokerAuthor; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public class EnrichMissingAuthorOrcid extends UpdateMatcher { +public class EnrichMissingAuthorOrcid extends UpdateMatcher { public EnrichMissingAuthorOrcid() { super(true, @@ -22,13 +22,13 @@ public class EnrichMissingAuthorOrcid extends UpdateMatcher { } @Override - protected List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final Set existingOrcids = target .getCreators() .stream() - .map(Author::getOrcid) + .map(OaBrokerAuthor::getOrcid) .filter(StringUtils::isNotBlank) .collect(Collectors.toSet()); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java index e870cf1fa..487382957 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java @@ -5,28 +5,28 @@ import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; -import eu.dnetlib.broker.objects.Instance; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerInstance; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; -public class EnrichMissingOpenAccess extends UpdateMatcher { +public class EnrichMissingOpenAccess extends UpdateMatcher { public EnrichMissingOpenAccess() { 
super(true, i -> Topic.ENRICH_MISSING_OA_VERSION, (p, i) -> p.getInstances().add(i), - Instance::getUrl); + OaBrokerInstance::getUrl); } @Override - protected List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final long count = target .getInstances() .stream() - .map(Instance::getLicense) + .map(OaBrokerInstance::getLicense) .filter(right -> right.equals(BrokerConstants.OPEN_ACCESS)) .count(); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java index cc72d9fa9..ee1617b1e 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java @@ -5,12 +5,12 @@ import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.TypedValue; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerTypedValue; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public class EnrichMissingPid extends UpdateMatcher { +public class EnrichMissingPid extends UpdateMatcher { public EnrichMissingPid() { super(true, @@ -20,8 +20,8 @@ public class EnrichMissingPid extends UpdateMatcher { } @Override - protected List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final long count = target.getPids().size(); if (count > 0) { diff --git 
a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java index ed8c26b5a..2c0533fa3 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java @@ -5,7 +5,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; @@ -19,8 +19,8 @@ public class EnrichMissingPublicationDate extends UpdateMatcher { } @Override - protected List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { if (isMissing(target.getPublicationdate()) && !isMissing(source.getPublicationdate())) { return Arrays.asList(source.getPublicationdate()); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java index 07b1fa41a..9ab9fce48 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java @@ -5,12 +5,12 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.TypedValue; 
+import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerTypedValue; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public class EnrichMissingSubject extends UpdateMatcher { +public class EnrichMissingSubject extends UpdateMatcher { public EnrichMissingSubject() { super(true, @@ -20,8 +20,8 @@ public class EnrichMissingSubject extends UpdateMatcher { } @Override - protected List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final Set existingSubject = target .getSubjects() .stream() @@ -35,7 +35,7 @@ public class EnrichMissingSubject extends UpdateMatcher { .collect(Collectors.toList()); } - private static String subjectAsString(final TypedValue s) { + private static String subjectAsString(final OaBrokerTypedValue s) { return s.getType() + "::" + s.getValue(); } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java index bfef3ee4f..e90a8f201 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java @@ -5,24 +5,24 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import eu.dnetlib.broker.objects.Instance; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerInstance; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; -public class 
EnrichMoreOpenAccess extends UpdateMatcher { +public class EnrichMoreOpenAccess extends UpdateMatcher { public EnrichMoreOpenAccess() { super(true, i -> Topic.ENRICH_MORE_OA_VERSION, (p, i) -> p.getInstances().add(i), - Instance::getUrl); + OaBrokerInstance::getUrl); } @Override - protected List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final Set urls = target .getInstances() .stream() diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java index d1f2e6022..43b4f0628 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java @@ -5,12 +5,12 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.TypedValue; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerTypedValue; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public class EnrichMorePid extends UpdateMatcher { +public class EnrichMorePid extends UpdateMatcher { public EnrichMorePid() { super(true, @@ -20,8 +20,8 @@ public class EnrichMorePid extends UpdateMatcher { } @Override - protected List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final Set existingPids = target .getPids() .stream() @@ -35,7 +35,7 @@ public class EnrichMorePid extends UpdateMatcher { 
.collect(Collectors.toList()); } - private static String pidAsString(final TypedValue pid) { + private static String pidAsString(final OaBrokerTypedValue pid) { return pid.getType() + "::" + pid.getValue(); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java index 39225e8ab..04fb494ef 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java @@ -5,12 +5,12 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.TypedValue; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerTypedValue; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public class EnrichMoreSubject extends UpdateMatcher { +public class EnrichMoreSubject extends UpdateMatcher { public EnrichMoreSubject() { super(true, @@ -20,8 +20,8 @@ public class EnrichMoreSubject extends UpdateMatcher { } @Override - protected List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final Set existingSubjects = target .getSubjects() .stream() @@ -35,7 +35,7 @@ public class EnrichMoreSubject extends UpdateMatcher { .collect(Collectors.toList()); } - private static String subjectAsString(final TypedValue s) { + private static String subjectAsString(final OaBrokerTypedValue s) { return s.getType() + "::" + s.getValue(); } } diff --git 
a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java new file mode 100644 index 000000000..8bcea5e6e --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java @@ -0,0 +1,31 @@ + +package eu.dnetlib.dhp.broker.oa.util; + +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.common.HdfsSupport; + +public class ClusterUtils { + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void removeDir(final SparkSession spark, final String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } + + public static Dataset readPath( + final SparkSession spark, + final String inputPath, + final Class clazz) { + return spark + .read() + .textFile(inputPath) + .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java index 730d06519..b61d5e7cc 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java @@ -15,8 +15,16 @@ import org.slf4j.LoggerFactory; import com.google.common.base.Function; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.TypedValue; +import eu.dnetlib.broker.objects.OaBrokerAuthor; +import eu.dnetlib.broker.objects.OaBrokerExternalReference; +import 
eu.dnetlib.broker.objects.OaBrokerInstance; +import eu.dnetlib.broker.objects.OaBrokerJournal; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerProject; +import eu.dnetlib.broker.objects.OaBrokerRelatedDataset; +import eu.dnetlib.broker.objects.OaBrokerRelatedPublication; +import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware; +import eu.dnetlib.broker.objects.OaBrokerTypedValue; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.Dataset; import eu.dnetlib.dhp.schema.oaf.ExternalReference; @@ -35,13 +43,13 @@ public class ConversionUtils { private static final Logger log = LoggerFactory.getLogger(ConversionUtils.class); - public static List oafInstanceToBrokerInstances(final Instance i) { + public static List oafInstanceToBrokerInstances(final Instance i) { if (i == null) { return new ArrayList<>(); } return mappedList(i.getUrl(), url -> { - final eu.dnetlib.broker.objects.Instance res = new eu.dnetlib.broker.objects.Instance(); + final OaBrokerInstance res = new OaBrokerInstance(); res.setUrl(url); res.setInstancetype(classId(i.getInstancetype())); res.setLicense(BrokerConstants.OPEN_ACCESS); @@ -50,20 +58,21 @@ public class ConversionUtils { }); } - public static TypedValue oafPidToBrokerPid(final StructuredProperty sp) { + public static OaBrokerTypedValue oafPidToBrokerPid(final StructuredProperty sp) { return oafStructPropToBrokerTypedValue(sp); } - public static TypedValue oafStructPropToBrokerTypedValue(final StructuredProperty sp) { - return sp != null ? new TypedValue(classId(sp.getQualifier()), sp.getValue()) : null; + public static OaBrokerTypedValue oafStructPropToBrokerTypedValue(final StructuredProperty sp) { + return sp != null ? 
new OaBrokerTypedValue(classId(sp.getQualifier()), sp.getValue()) : null; } - public static final eu.dnetlib.broker.objects.Dataset oafDatasetToBrokerDataset(final Dataset d) { + public static final OaBrokerRelatedDataset oafDatasetToBrokerDataset(final Dataset d) { if (d == null) { return null; } - final eu.dnetlib.broker.objects.Dataset res = new eu.dnetlib.broker.objects.Dataset(); + final OaBrokerRelatedDataset res = new OaBrokerRelatedDataset(); + res.setOpenaireId(d.getId()); res.setOriginalId(first(d.getOriginalId())); res.setTitle(structPropValue(d.getTitle())); res.setPids(mappedList(d.getPid(), ConversionUtils::oafPidToBrokerPid)); @@ -72,12 +81,13 @@ public class ConversionUtils { return res; } - public static eu.dnetlib.broker.objects.Publication oafPublicationToBrokerPublication(final Publication p) { + public static OaBrokerRelatedPublication oafPublicationToBrokerPublication(final Publication p) { if (p == null) { return null; } - final eu.dnetlib.broker.objects.Publication res = new eu.dnetlib.broker.objects.Publication(); + final OaBrokerRelatedPublication res = new OaBrokerRelatedPublication(); + res.setOpenaireId(p.getId()); res.setOriginalId(first(p.getOriginalId())); res.setTitle(structPropValue(p.getTitle())); res.setPids(mappedList(p.getPid(), ConversionUtils::oafPidToBrokerPid)); @@ -87,12 +97,12 @@ public class ConversionUtils { return res; } - public static final OpenaireBrokerResult oafResultToBrokerResult(final Result result) { + public static final OaBrokerMainEntity oafResultToBrokerResult(final Result result) { if (result == null) { return null; } - final OpenaireBrokerResult res = new OpenaireBrokerResult(); + final OaBrokerMainEntity res = new OaBrokerMainEntity(); res.setOpenaireId(result.getId()); res.setOriginalId(first(result.getOriginalId())); @@ -118,7 +128,7 @@ public class ConversionUtils { return res; } - private static eu.dnetlib.broker.objects.Author oafAuthorToBrokerAuthor(final Author author) { + private static 
OaBrokerAuthor oafAuthorToBrokerAuthor(final Author author) { if (author == null) { return null; } @@ -135,15 +145,15 @@ public class ConversionUtils { .findFirst() .orElse(null) : null; - return new eu.dnetlib.broker.objects.Author(author.getFullname(), pids); + return new OaBrokerAuthor(author.getFullname(), pids); } - private static eu.dnetlib.broker.objects.Journal oafJournalToBrokerJournal(final Journal journal) { + private static OaBrokerJournal oafJournalToBrokerJournal(final Journal journal) { if (journal == null) { return null; } - final eu.dnetlib.broker.objects.Journal res = new eu.dnetlib.broker.objects.Journal(); + final OaBrokerJournal res = new OaBrokerJournal(); res.setName(journal.getName()); res.setIssn(journal.getIssnPrinted()); res.setEissn(journal.getIssnOnline()); @@ -152,12 +162,12 @@ public class ConversionUtils { return res; } - private static eu.dnetlib.broker.objects.ExternalReference oafExtRefToBrokerExtRef(final ExternalReference ref) { + private static OaBrokerExternalReference oafExtRefToBrokerExtRef(final ExternalReference ref) { if (ref == null) { return null; } - final eu.dnetlib.broker.objects.ExternalReference res = new eu.dnetlib.broker.objects.ExternalReference(); + final OaBrokerExternalReference res = new OaBrokerExternalReference(); res.setRefidentifier(ref.getRefidentifier()); res.setSitename(ref.getSitename()); res.setType(classId(ref.getQualifier())); @@ -165,12 +175,13 @@ public class ConversionUtils { return res; } - public static final eu.dnetlib.broker.objects.Project oafProjectToBrokerProject(final Project p) { + public static final OaBrokerProject oafProjectToBrokerProject(final Project p) { if (p == null) { return null; } - final eu.dnetlib.broker.objects.Project res = new eu.dnetlib.broker.objects.Project(); + final OaBrokerProject res = new OaBrokerProject(); + res.setOpenaireId(p.getId()); res.setTitle(fieldValue(p.getTitle())); res.setAcronym(fieldValue(p.getAcronym())); res.setCode(fieldValue(p.getCode())); @@ 
-190,12 +201,13 @@ public class ConversionUtils { return res; } - public static final eu.dnetlib.broker.objects.Software oafSoftwareToBrokerSoftware(final Software sw) { + public static final OaBrokerRelatedSoftware oafSoftwareToBrokerSoftware(final Software sw) { if (sw == null) { return null; } - final eu.dnetlib.broker.objects.Software res = new eu.dnetlib.broker.objects.Software(); + final OaBrokerRelatedSoftware res = new OaBrokerRelatedSoftware(); + res.setOpenaireId(sw.getId()); res.setName(structPropValue(sw.getTitle())); res.setDescription(fieldValue(sw.getDescription())); res.setRepository(fieldValue(sw.getCodeRepositoryUrl())); @@ -247,7 +259,7 @@ public class ConversionUtils { : new ArrayList<>(); } - private static List structPropTypedList(final List list) { + private static List structPropTypedList(final List list) { if (list == null) { return new ArrayList<>(); } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java index 4c20ac5ca..7451e5891 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java @@ -4,7 +4,7 @@ package eu.dnetlib.dhp.broker.oa.util; import java.util.ArrayList; import java.util.List; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.model.EventFactory; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsReferencedBy; @@ -68,7 +68,7 @@ public class EventFinder { public static EventGroup generateEvents(final ResultGroup results, final DedupConfig dedupConfig) { final List> list = new ArrayList<>(); - for (final OpenaireBrokerResult target : results.getData()) { + for (final 
OaBrokerMainEntity target : results.getData()) { for (final UpdateMatcher matcher : matchers) { list.addAll(matcher.searchUpdatesForRecord(target, results.getData(), dedupConfig)); } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java index 2c4bda53d..25d0d2bca 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java @@ -9,10 +9,10 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.broker.objects.Instance; -import eu.dnetlib.broker.objects.OpenAireEventPayload; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.Provenance; +import eu.dnetlib.broker.objects.OaBrokerEventPayload; +import eu.dnetlib.broker.objects.OaBrokerInstance; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerProvenance; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.model.MapDocument; @@ -25,11 +25,11 @@ public final class UpdateInfo { private final T highlightValue; - private final OpenaireBrokerResult source; + private final OaBrokerMainEntity source; - private final OpenaireBrokerResult target; + private final OaBrokerMainEntity target; - private final BiConsumer compileHighlight; + private final BiConsumer compileHighlight; private final Function highlightToString; @@ -37,9 +37,9 @@ public final class UpdateInfo { private static final Logger log = LoggerFactory.getLogger(UpdateInfo.class); - public UpdateInfo(final Topic topic, final T highlightValue, final OpenaireBrokerResult source, - final OpenaireBrokerResult target, - final BiConsumer compileHighlight, + public UpdateInfo(final Topic topic, final T 
highlightValue, final OaBrokerMainEntity source, + final OaBrokerMainEntity target, + final BiConsumer compileHighlight, final Function highlightToString, final DedupConfig dedupConfig) { this.topic = topic; @@ -55,17 +55,17 @@ public final class UpdateInfo { return highlightValue; } - public OpenaireBrokerResult getSource() { + public OaBrokerMainEntity getSource() { return source; } - public OpenaireBrokerResult getTarget() { + public OaBrokerMainEntity getTarget() { return target; } private float calculateTrust(final DedupConfig dedupConfig, - final OpenaireBrokerResult r1, - final OpenaireBrokerResult r2) { + final OaBrokerMainEntity r1, + final OaBrokerMainEntity r2) { if (dedupConfig == null) { return BrokerConstants.MIN_TRUST; @@ -104,11 +104,11 @@ public final class UpdateInfo { return highlightToString.apply(getHighlightValue()); } - public OpenAireEventPayload asBrokerPayload() { + public OaBrokerEventPayload asBrokerPayload() { compileHighlight.accept(target, getHighlightValue()); - final OpenaireBrokerResult hl = new OpenaireBrokerResult(); + final OaBrokerMainEntity hl = new OaBrokerMainEntity(); compileHighlight.accept(hl, getHighlightValue()); final String provId = getSource().getOriginalId(); @@ -117,14 +117,14 @@ public final class UpdateInfo { final String provUrl = getSource() .getInstances() .stream() - .map(Instance::getUrl) + .map(OaBrokerInstance::getUrl) .findFirst() .orElse(null); ; - final Provenance provenance = new Provenance(provId, provRepo, provUrl); + final OaBrokerProvenance provenance = new OaBrokerProvenance(provId, provRepo, provUrl); - final OpenAireEventPayload res = new OpenAireEventPayload(); + final OaBrokerEventPayload res = new OaBrokerEventPayload(); res.setResult(target); res.setHighlight(hl); res.setTrust(trust); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultAggregator.java 
b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultAggregator.java index a46fde445..ee1c8963e 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultAggregator.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultAggregator.java @@ -5,11 +5,11 @@ import org.apache.spark.sql.Encoder; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.expressions.Aggregator; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.schema.oaf.Relation; import scala.Tuple2; -public class ResultAggregator extends Aggregator, ResultGroup, ResultGroup> { +public class ResultAggregator extends Aggregator, ResultGroup, ResultGroup> { /** * @@ -22,7 +22,7 @@ public class ResultAggregator extends Aggregator t) { + public ResultGroup reduce(final ResultGroup group, final Tuple2 t) { group.getData().add(t._1); return group; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultGroup.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultGroup.java index 3f9dbe8af..e718e0f1c 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultGroup.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultGroup.java @@ -5,7 +5,7 @@ import java.io.Serializable; import java.util.ArrayList; import java.util.List; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; public class ResultGroup implements Serializable { @@ -14,13 +14,13 @@ public class ResultGroup implements Serializable { */ private static final long serialVersionUID = -3360828477088669296L; - private List data = 
new ArrayList<>(); + private List data = new ArrayList<>(); - public List getData() { + public List getData() { return data; } - public void setData(final List data) { + public void setData(final List data) { this.data = data; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/OpenaireBrokerResultAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/OaBrokerMainEntityAggregator.java similarity index 59% rename from dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/OpenaireBrokerResultAggregator.java rename to dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/OaBrokerMainEntityAggregator.java index e72dcb988..6a2d9b06d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/OpenaireBrokerResultAggregator.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/OaBrokerMainEntityAggregator.java @@ -5,11 +5,11 @@ import org.apache.spark.sql.Encoder; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.expressions.Aggregator; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import scala.Tuple2; -public class OpenaireBrokerResultAggregator - extends Aggregator, OpenaireBrokerResult, OpenaireBrokerResult> { +public class OaBrokerMainEntityAggregator + extends Aggregator, OaBrokerMainEntity, OaBrokerMainEntity> { /** * @@ -17,17 +17,17 @@ public class OpenaireBrokerResultAggregator private static final long serialVersionUID = -3687878788861013488L; @Override - public OpenaireBrokerResult zero() { - return new OpenaireBrokerResult(); + public OaBrokerMainEntity zero() { + return new OaBrokerMainEntity(); } @Override - public OpenaireBrokerResult finish(final OpenaireBrokerResult g) { + 
public OaBrokerMainEntity finish(final OaBrokerMainEntity g) { return g; } @Override - public OpenaireBrokerResult reduce(final OpenaireBrokerResult g, final Tuple2 t) { + public OaBrokerMainEntity reduce(final OaBrokerMainEntity g, final Tuple2 t) { if (g.getOriginalId() == null) { return t._1; } else if (t._2 instanceof RelatedSoftware) { @@ -38,13 +38,15 @@ public class OpenaireBrokerResultAggregator g.getPublications().add(((RelatedPublication) t._2).getRelPublication()); } else if (t._2 instanceof RelatedProject) { g.getProjects().add(((RelatedProject) t._2).getRelProject()); + } else { + throw new RuntimeException("Invalid Object: " + t._2.getClass()); } return g; } @Override - public OpenaireBrokerResult merge(final OpenaireBrokerResult g1, final OpenaireBrokerResult g2) { + public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) { if (g1.getOriginalId() != null) { g1.getSoftwares().addAll(g2.getSoftwares()); g1.getDatasets().addAll(g2.getDatasets()); @@ -57,13 +59,13 @@ public class OpenaireBrokerResultAggregator } @Override - public Encoder bufferEncoder() { - return Encoders.bean(OpenaireBrokerResult.class); + public Encoder bufferEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); } @Override - public Encoder outputEncoder() { - return Encoders.bean(OpenaireBrokerResult.class); + public Encoder outputEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDataset.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDataset.java index 6a5fb258c..daf75ea2e 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDataset.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDataset.java @@ -3,7 +3,7 @@ package 
eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; import java.io.Serializable; -import eu.dnetlib.broker.objects.Dataset; +import eu.dnetlib.broker.objects.OaBrokerRelatedDataset; public class RelatedDataset implements Serializable { @@ -13,12 +13,12 @@ public class RelatedDataset implements Serializable { private static final long serialVersionUID = 774487705184038324L; private String source; private String relType; - private Dataset relDataset; + private OaBrokerRelatedDataset relDataset; public RelatedDataset() { } - public RelatedDataset(final String source, final String relType, final Dataset relDataset) { + public RelatedDataset(final String source, final String relType, final OaBrokerRelatedDataset relDataset) { this.source = source; this.relType = relType; this.relDataset = relDataset; @@ -40,11 +40,11 @@ public class RelatedDataset implements Serializable { this.relType = relType; } - public Dataset getRelDataset() { + public OaBrokerRelatedDataset getRelDataset() { return relDataset; } - public void setRelDataset(final Dataset relDataset) { + public void setRelDataset(final OaBrokerRelatedDataset relDataset) { this.relDataset = relDataset; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProject.java index fafec1e19..4116c8c77 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProject.java @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; import java.io.Serializable; -import eu.dnetlib.broker.objects.Project; +import eu.dnetlib.broker.objects.OaBrokerProject; public class RelatedProject implements Serializable { @@ -14,12 +14,12 @@ public class 
RelatedProject implements Serializable { private String source; private String relType; - private Project relProject; + private OaBrokerProject relProject; public RelatedProject() { } - public RelatedProject(final String source, final String relType, final Project relProject) { + public RelatedProject(final String source, final String relType, final OaBrokerProject relProject) { this.source = source; this.relType = relType; this.relProject = relProject; @@ -41,11 +41,11 @@ public class RelatedProject implements Serializable { this.relType = relType; } - public Project getRelProject() { + public OaBrokerProject getRelProject() { return relProject; } - public void setRelProject(final Project relProject) { + public void setRelProject(final OaBrokerProject relProject) { this.relProject = relProject; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublication.java index 8a31ddf7e..9e222a952 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublication.java @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; import java.io.Serializable; -import eu.dnetlib.broker.objects.Publication; +import eu.dnetlib.broker.objects.OaBrokerRelatedPublication; public class RelatedPublication implements Serializable { @@ -14,12 +14,13 @@ public class RelatedPublication implements Serializable { private String source; private String relType; - private Publication relPublication; + private OaBrokerRelatedPublication relPublication; public RelatedPublication() { } - public RelatedPublication(final String source, final String relType, final Publication relPublication) { + public 
RelatedPublication(final String source, final String relType, + final OaBrokerRelatedPublication relPublication) { this.source = source; this.relType = relType; this.relPublication = relPublication; @@ -41,11 +42,11 @@ public class RelatedPublication implements Serializable { this.relType = relType; } - public Publication getRelPublication() { + public OaBrokerRelatedPublication getRelPublication() { return relPublication; } - public void setRelPublication(final Publication relPublication) { + public void setRelPublication(final OaBrokerRelatedPublication relPublication) { this.relPublication = relPublication; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftware.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftware.java index 319387469..2f3b8668c 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftware.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftware.java @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; import java.io.Serializable; -import eu.dnetlib.broker.objects.Software; +import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware; public class RelatedSoftware implements Serializable { @@ -13,12 +13,12 @@ public class RelatedSoftware implements Serializable { private static final long serialVersionUID = 7573383356943300157L; private String source; private String relType; - private Software relSoftware; + private OaBrokerRelatedSoftware relSoftware; public RelatedSoftware() { } - public RelatedSoftware(final String source, final String relType, final Software relSoftware) { + public RelatedSoftware(final String source, final String relType, final OaBrokerRelatedSoftware relSoftware) { this.source = source; this.relType = relType; this.relSoftware = 
relSoftware; @@ -40,11 +40,11 @@ public class RelatedSoftware implements Serializable { this.relType = relType; } - public Software getRelSoftware() { + public OaBrokerRelatedSoftware getRelSoftware() { return relSoftware; } - public void setRelSoftware(final Software relSoftware) { + public void setRelSoftware(final OaBrokerRelatedSoftware relSoftware) { this.relSoftware = relSoftware; } diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_relations.json b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_relations.json new file mode 100644 index 000000000..32fd1d8f3 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_relations.json @@ -0,0 +1,14 @@ +[ + { + "paramName": "g", + "paramLongName": "graphPath", + "paramDescription": "the path where there the graph is stored", + "paramRequired": true + }, + { + "paramName": "o", + "paramLongName": "relsPath", + "paramDescription": "the path where the generated relations will be stored", + "paramRequired": true + } +] diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_simple_entities.json b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_simple_entities.json new file mode 100644 index 000000000..6f5e330f6 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_simple_entities.json @@ -0,0 +1,14 @@ +[ + { + "paramName": "g", + "paramLongName": "graphPath", + "paramDescription": "the path where there the graph is stored", + "paramRequired": true + }, + { + "paramName": "o", + "paramLongName": "simpleEntitiesPath", + "paramDescription": "the path where the generated simple entities (without relations) will be stored", + "paramRequired": true + } +] From 7d416f08d8838d33e2d37b24d520e11d97aada1f Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 22 Jun 2020 
09:50:43 +0200 Subject: [PATCH 07/37] graph cleaning workflow: set hostedby to unknown repository when defined as NULL --- .../dnetlib/dhp/schema/common/ModelConstants.java | 15 +++++++++++++++ .../dhp/oa/graph/clean/CleanGraphSparkJob.java | 3 +++ 2 files changed, 18 insertions(+) diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java index a3c1610db..c5905e45b 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java @@ -1,6 +1,10 @@ package eu.dnetlib.dhp.schema.common; +import java.security.Key; + +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.Qualifier; public class ModelConstants { @@ -95,6 +99,9 @@ public class ModelConstants { SYSIMPORT_CROSSWALK_ENTITYREGISTRY, SYSIMPORT_CROSSWALK_ENTITYREGISTRY, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS); + public static final KeyValue UNKNOWN_REPOSITORY = keyValue( + "10|openaire____::55045bd2a65019fd8e6741a755395c8c", "Unknown Repository"); + private static Qualifier qualifier( final String classid, final String classname, @@ -107,4 +114,12 @@ public class ModelConstants { q.setSchemename(schemename); return q; } + + private static KeyValue keyValue(String key, String value) { + KeyValue kv = new KeyValue(); + kv.setKey(key); + kv.setValue(value); + kv.setDataInfo(new DataInfo()); + return kv; + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java index c90898814..8f43ab1cf 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java +++ 
b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java @@ -144,6 +144,9 @@ public class CleanGraphSparkJob { if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) { i.setAccessright(qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES)); } + if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) { + i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY); + } } } From 961a0d0b4952a51aaf27fb10c3a6a2bcba018710 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 22 Jun 2020 10:20:45 +0200 Subject: [PATCH 08/37] [actionset promotion] log debugging info in case of error in the action payload extraction or parsing the data --- .../PromoteActionPayloadForGraphTableJob.java | 29 +++++++++++++++---- 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java index 17bfc4af3..5fa9e6723 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java @@ -4,6 +4,7 @@ package eu.dnetlib.dhp.actionmanager.promote; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass; +import java.io.IOException; import java.util.Objects; import java.util.Optional; import java.util.function.BiFunction; @@ -20,6 +21,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.exc.UnrecognizedPropertyException; import 
eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier; @@ -134,24 +136,39 @@ public class PromoteActionPayloadForGraphTableJob { .map( (MapFunction) value -> OBJECT_MAPPER.readValue(value, rowClazz), Encoders.bean(rowClazz)); - - /* - * return spark .read() .parquet(path) .as(Encoders.bean(rowClazz)); - */ } private static Dataset readActionPayload( SparkSession spark, String path, Class actionPayloadClazz) { logger.info("Reading action payload from path: {}", path); + return spark .read() .parquet(path) + .map((MapFunction) value -> extractPayload(value), Encoders.STRING()) .map( - (MapFunction) value -> OBJECT_MAPPER - .readValue(value. getAs("payload"), actionPayloadClazz), + (MapFunction) value -> decodePayload(actionPayloadClazz, value), Encoders.bean(actionPayloadClazz)); } + private static String extractPayload(Row value) { + try { + return value. getAs("payload"); + } catch (IllegalArgumentException | ClassCastException e) { + logger.error("cannot extract payload from action: {}", value.toString()); + throw e; + } + } + + private static A decodePayload(Class actionPayloadClazz, String payload) throws IOException { + try { + return OBJECT_MAPPER.readValue(payload, actionPayloadClazz); + } catch (UnrecognizedPropertyException e) { + logger.error("error decoding payload: {}", payload); + throw e; + } + } + private static Dataset promoteActionPayloadForGraphTable( Dataset rowDS, Dataset actionPayloadDS, From 1e3dab06318516b1eda81fe42dd370aa2c726d0b Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 22 Jun 2020 11:27:39 +0200 Subject: [PATCH 09/37] [maven-release-plugin] prepare release dhp-1.2.3 --- dhp-build/dhp-build-assembly-resources/pom.xml | 2 +- dhp-build/dhp-build-properties-maven-plugin/pom.xml | 2 +- dhp-build/dhp-code-style/pom.xml | 2 +- dhp-build/pom.xml | 2 +- dhp-common/pom.xml | 2 +- dhp-schemas/pom.xml | 2 +- dhp-workflows/dhp-actionmanager/pom.xml | 2 +- 
dhp-workflows/dhp-aggregation/pom.xml | 2 +- dhp-workflows/dhp-blacklist/pom.xml | 2 +- dhp-workflows/dhp-broker-events/pom.xml | 2 +- dhp-workflows/dhp-dedup-openaire/pom.xml | 2 +- dhp-workflows/dhp-dedup-scholexplorer/pom.xml | 2 +- dhp-workflows/dhp-distcp/pom.xml | 2 +- dhp-workflows/dhp-doiboost/pom.xml | 2 +- dhp-workflows/dhp-enrichment/pom.xml | 2 +- dhp-workflows/dhp-graph-mapper/pom.xml | 2 +- dhp-workflows/dhp-graph-provision-scholexplorer/pom.xml | 2 +- dhp-workflows/dhp-graph-provision/pom.xml | 2 +- dhp-workflows/dhp-stats-update/pom.xml | 2 +- dhp-workflows/dhp-worfklow-profiles/pom.xml | 2 +- dhp-workflows/pom.xml | 2 +- pom.xml | 4 ++-- 22 files changed, 23 insertions(+), 23 deletions(-) diff --git a/dhp-build/dhp-build-assembly-resources/pom.xml b/dhp-build/dhp-build-assembly-resources/pom.xml index 2a89a26fd..2eb7e76c6 100644 --- a/dhp-build/dhp-build-assembly-resources/pom.xml +++ b/dhp-build/dhp-build-assembly-resources/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib.dhp dhp-build - 1.2.3-SNAPSHOT + 1.2.3 dhp-build-assembly-resources diff --git a/dhp-build/dhp-build-properties-maven-plugin/pom.xml b/dhp-build/dhp-build-properties-maven-plugin/pom.xml index 5be114e3c..6696f2f53 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/pom.xml +++ b/dhp-build/dhp-build-properties-maven-plugin/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib.dhp dhp-build - 1.2.3-SNAPSHOT + 1.2.3 dhp-build-properties-maven-plugin diff --git a/dhp-build/dhp-code-style/pom.xml b/dhp-build/dhp-code-style/pom.xml index 515ed35ce..40a5df58d 100644 --- a/dhp-build/dhp-code-style/pom.xml +++ b/dhp-build/dhp-code-style/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib.dhp dhp-code-style - 1.2.3-SNAPSHOT + 1.2.3 jar diff --git a/dhp-build/pom.xml b/dhp-build/pom.xml index d2dcbc36e..a787b2fb5 100644 --- a/dhp-build/pom.xml +++ b/dhp-build/pom.xml @@ -4,7 +4,7 @@ eu.dnetlib.dhp dhp - 1.2.3-SNAPSHOT + 1.2.3 dhp-build pom diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index 0e7652dd3..c79662380 100644 --- 
a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib.dhp dhp - 1.2.3-SNAPSHOT + 1.2.3 ../ diff --git a/dhp-schemas/pom.xml b/dhp-schemas/pom.xml index 56fb8ead2..a770ab6b1 100644 --- a/dhp-schemas/pom.xml +++ b/dhp-schemas/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib.dhp dhp - 1.2.3-SNAPSHOT + 1.2.3 ../ diff --git a/dhp-workflows/dhp-actionmanager/pom.xml b/dhp-workflows/dhp-actionmanager/pom.xml index b50c6705b..d978b4e1e 100644 --- a/dhp-workflows/dhp-actionmanager/pom.xml +++ b/dhp-workflows/dhp-actionmanager/pom.xml @@ -4,7 +4,7 @@ eu.dnetlib.dhp dhp-workflows - 1.2.3-SNAPSHOT + 1.2.3 dhp-actionmanager diff --git a/dhp-workflows/dhp-aggregation/pom.xml b/dhp-workflows/dhp-aggregation/pom.xml index c04910a58..da05f16bd 100644 --- a/dhp-workflows/dhp-aggregation/pom.xml +++ b/dhp-workflows/dhp-aggregation/pom.xml @@ -4,7 +4,7 @@ eu.dnetlib.dhp dhp-workflows - 1.2.3-SNAPSHOT + 1.2.3 dhp-aggregation diff --git a/dhp-workflows/dhp-blacklist/pom.xml b/dhp-workflows/dhp-blacklist/pom.xml index 04d334cd7..14bc6bc98 100644 --- a/dhp-workflows/dhp-blacklist/pom.xml +++ b/dhp-workflows/dhp-blacklist/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3-SNAPSHOT + 1.2.3 4.0.0 diff --git a/dhp-workflows/dhp-broker-events/pom.xml b/dhp-workflows/dhp-broker-events/pom.xml index 8d7d3b88c..3ebebc579 100644 --- a/dhp-workflows/dhp-broker-events/pom.xml +++ b/dhp-workflows/dhp-broker-events/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3-SNAPSHOT + 1.2.3 4.0.0 diff --git a/dhp-workflows/dhp-dedup-openaire/pom.xml b/dhp-workflows/dhp-dedup-openaire/pom.xml index 1f5f2620e..ec5ff7308 100644 --- a/dhp-workflows/dhp-dedup-openaire/pom.xml +++ b/dhp-workflows/dhp-dedup-openaire/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3-SNAPSHOT + 1.2.3 4.0.0 dhp-dedup-openaire diff --git a/dhp-workflows/dhp-dedup-scholexplorer/pom.xml b/dhp-workflows/dhp-dedup-scholexplorer/pom.xml index e9e11b417..4aa3ebced 100644 --- 
a/dhp-workflows/dhp-dedup-scholexplorer/pom.xml +++ b/dhp-workflows/dhp-dedup-scholexplorer/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3-SNAPSHOT + 1.2.3 4.0.0 diff --git a/dhp-workflows/dhp-distcp/pom.xml b/dhp-workflows/dhp-distcp/pom.xml index 5707ddfc5..5e2995340 100644 --- a/dhp-workflows/dhp-distcp/pom.xml +++ b/dhp-workflows/dhp-distcp/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3-SNAPSHOT + 1.2.3 4.0.0 diff --git a/dhp-workflows/dhp-doiboost/pom.xml b/dhp-workflows/dhp-doiboost/pom.xml index 39bb81ec1..40fc62c2d 100644 --- a/dhp-workflows/dhp-doiboost/pom.xml +++ b/dhp-workflows/dhp-doiboost/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3-SNAPSHOT + 1.2.3 4.0.0 diff --git a/dhp-workflows/dhp-enrichment/pom.xml b/dhp-workflows/dhp-enrichment/pom.xml index e71a72f3e..43ad12e0c 100644 --- a/dhp-workflows/dhp-enrichment/pom.xml +++ b/dhp-workflows/dhp-enrichment/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3-SNAPSHOT + 1.2.3 4.0.0 diff --git a/dhp-workflows/dhp-graph-mapper/pom.xml b/dhp-workflows/dhp-graph-mapper/pom.xml index f650f1c17..b4f7a4059 100644 --- a/dhp-workflows/dhp-graph-mapper/pom.xml +++ b/dhp-workflows/dhp-graph-mapper/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3-SNAPSHOT + 1.2.3 4.0.0 diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/pom.xml b/dhp-workflows/dhp-graph-provision-scholexplorer/pom.xml index 2466ca8e2..c27c1b1b8 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/pom.xml +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3-SNAPSHOT + 1.2.3 4.0.0 diff --git a/dhp-workflows/dhp-graph-provision/pom.xml b/dhp-workflows/dhp-graph-provision/pom.xml index b0aec1e5d..69dcd249e 100644 --- a/dhp-workflows/dhp-graph-provision/pom.xml +++ b/dhp-workflows/dhp-graph-provision/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3-SNAPSHOT + 1.2.3 4.0.0 diff --git 
a/dhp-workflows/dhp-stats-update/pom.xml b/dhp-workflows/dhp-stats-update/pom.xml index 397bd8d08..148e93ae7 100644 --- a/dhp-workflows/dhp-stats-update/pom.xml +++ b/dhp-workflows/dhp-stats-update/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3-SNAPSHOT + 1.2.3 4.0.0 dhp-stats-update diff --git a/dhp-workflows/dhp-worfklow-profiles/pom.xml b/dhp-workflows/dhp-worfklow-profiles/pom.xml index e03362034..34c622b1a 100644 --- a/dhp-workflows/dhp-worfklow-profiles/pom.xml +++ b/dhp-workflows/dhp-worfklow-profiles/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3-SNAPSHOT + 1.2.3 4.0.0 diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml index 8d8d57c84..1a1082ba2 100644 --- a/dhp-workflows/pom.xml +++ b/dhp-workflows/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib.dhp dhp - 1.2.3-SNAPSHOT + 1.2.3 ../ diff --git a/pom.xml b/pom.xml index 06e2b7aaf..6ec6bfb3a 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ 4.0.0 eu.dnetlib.dhp dhp - 1.2.3-SNAPSHOT + 1.2.3 pom @@ -38,7 +38,7 @@ scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git https://code-repo.d4science.org/D-Net/dnet-hadoop/ - HEAD + dhp-1.2.3 This module is the root descriptor for the dnet-hadoop project From 9cd27183b6738e2bddbf1890bb37547bde054bca Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 22 Jun 2020 11:27:44 +0200 Subject: [PATCH 10/37] [maven-release-plugin] prepare for next development iteration --- dhp-build/dhp-build-assembly-resources/pom.xml | 2 +- dhp-build/dhp-build-properties-maven-plugin/pom.xml | 2 +- dhp-build/dhp-code-style/pom.xml | 2 +- dhp-build/pom.xml | 2 +- dhp-common/pom.xml | 2 +- dhp-schemas/pom.xml | 2 +- dhp-workflows/dhp-actionmanager/pom.xml | 2 +- dhp-workflows/dhp-aggregation/pom.xml | 2 +- dhp-workflows/dhp-blacklist/pom.xml | 2 +- dhp-workflows/dhp-broker-events/pom.xml | 2 +- dhp-workflows/dhp-dedup-openaire/pom.xml | 2 +- dhp-workflows/dhp-dedup-scholexplorer/pom.xml | 2 +- 
dhp-workflows/dhp-distcp/pom.xml | 2 +- dhp-workflows/dhp-doiboost/pom.xml | 2 +- dhp-workflows/dhp-enrichment/pom.xml | 2 +- dhp-workflows/dhp-graph-mapper/pom.xml | 2 +- dhp-workflows/dhp-graph-provision-scholexplorer/pom.xml | 2 +- dhp-workflows/dhp-graph-provision/pom.xml | 2 +- dhp-workflows/dhp-stats-update/pom.xml | 2 +- dhp-workflows/dhp-worfklow-profiles/pom.xml | 2 +- dhp-workflows/pom.xml | 2 +- pom.xml | 4 ++-- 22 files changed, 23 insertions(+), 23 deletions(-) diff --git a/dhp-build/dhp-build-assembly-resources/pom.xml b/dhp-build/dhp-build-assembly-resources/pom.xml index 2eb7e76c6..012ff89a3 100644 --- a/dhp-build/dhp-build-assembly-resources/pom.xml +++ b/dhp-build/dhp-build-assembly-resources/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib.dhp dhp-build - 1.2.3 + 1.2.4-SNAPSHOT dhp-build-assembly-resources diff --git a/dhp-build/dhp-build-properties-maven-plugin/pom.xml b/dhp-build/dhp-build-properties-maven-plugin/pom.xml index 6696f2f53..256017e2c 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/pom.xml +++ b/dhp-build/dhp-build-properties-maven-plugin/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib.dhp dhp-build - 1.2.3 + 1.2.4-SNAPSHOT dhp-build-properties-maven-plugin diff --git a/dhp-build/dhp-code-style/pom.xml b/dhp-build/dhp-code-style/pom.xml index 40a5df58d..e60e8076e 100644 --- a/dhp-build/dhp-code-style/pom.xml +++ b/dhp-build/dhp-code-style/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib.dhp dhp-code-style - 1.2.3 + 1.2.4-SNAPSHOT jar diff --git a/dhp-build/pom.xml b/dhp-build/pom.xml index a787b2fb5..12b999b9c 100644 --- a/dhp-build/pom.xml +++ b/dhp-build/pom.xml @@ -4,7 +4,7 @@ eu.dnetlib.dhp dhp - 1.2.3 + 1.2.4-SNAPSHOT dhp-build pom diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index c79662380..0819a8bd2 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib.dhp dhp - 1.2.3 + 1.2.4-SNAPSHOT ../ diff --git a/dhp-schemas/pom.xml b/dhp-schemas/pom.xml index a770ab6b1..2e5652b43 100644 --- a/dhp-schemas/pom.xml +++ 
b/dhp-schemas/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib.dhp dhp - 1.2.3 + 1.2.4-SNAPSHOT ../ diff --git a/dhp-workflows/dhp-actionmanager/pom.xml b/dhp-workflows/dhp-actionmanager/pom.xml index d978b4e1e..0b4d25700 100644 --- a/dhp-workflows/dhp-actionmanager/pom.xml +++ b/dhp-workflows/dhp-actionmanager/pom.xml @@ -4,7 +4,7 @@ eu.dnetlib.dhp dhp-workflows - 1.2.3 + 1.2.4-SNAPSHOT dhp-actionmanager diff --git a/dhp-workflows/dhp-aggregation/pom.xml b/dhp-workflows/dhp-aggregation/pom.xml index da05f16bd..a1bc1c483 100644 --- a/dhp-workflows/dhp-aggregation/pom.xml +++ b/dhp-workflows/dhp-aggregation/pom.xml @@ -4,7 +4,7 @@ eu.dnetlib.dhp dhp-workflows - 1.2.3 + 1.2.4-SNAPSHOT dhp-aggregation diff --git a/dhp-workflows/dhp-blacklist/pom.xml b/dhp-workflows/dhp-blacklist/pom.xml index 14bc6bc98..9c25f7b29 100644 --- a/dhp-workflows/dhp-blacklist/pom.xml +++ b/dhp-workflows/dhp-blacklist/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3 + 1.2.4-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-broker-events/pom.xml b/dhp-workflows/dhp-broker-events/pom.xml index 3ebebc579..424015a3c 100644 --- a/dhp-workflows/dhp-broker-events/pom.xml +++ b/dhp-workflows/dhp-broker-events/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3 + 1.2.4-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-dedup-openaire/pom.xml b/dhp-workflows/dhp-dedup-openaire/pom.xml index ec5ff7308..03ddbcf4c 100644 --- a/dhp-workflows/dhp-dedup-openaire/pom.xml +++ b/dhp-workflows/dhp-dedup-openaire/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3 + 1.2.4-SNAPSHOT 4.0.0 dhp-dedup-openaire diff --git a/dhp-workflows/dhp-dedup-scholexplorer/pom.xml b/dhp-workflows/dhp-dedup-scholexplorer/pom.xml index 4aa3ebced..aa4070b01 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/pom.xml +++ b/dhp-workflows/dhp-dedup-scholexplorer/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3 + 1.2.4-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-distcp/pom.xml b/dhp-workflows/dhp-distcp/pom.xml index 
5e2995340..8c10538c0 100644 --- a/dhp-workflows/dhp-distcp/pom.xml +++ b/dhp-workflows/dhp-distcp/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3 + 1.2.4-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-doiboost/pom.xml b/dhp-workflows/dhp-doiboost/pom.xml index 40fc62c2d..3299c1496 100644 --- a/dhp-workflows/dhp-doiboost/pom.xml +++ b/dhp-workflows/dhp-doiboost/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3 + 1.2.4-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-enrichment/pom.xml b/dhp-workflows/dhp-enrichment/pom.xml index 43ad12e0c..d0ab77cc5 100644 --- a/dhp-workflows/dhp-enrichment/pom.xml +++ b/dhp-workflows/dhp-enrichment/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3 + 1.2.4-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-graph-mapper/pom.xml b/dhp-workflows/dhp-graph-mapper/pom.xml index b4f7a4059..0439c2ba3 100644 --- a/dhp-workflows/dhp-graph-mapper/pom.xml +++ b/dhp-workflows/dhp-graph-mapper/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3 + 1.2.4-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/pom.xml b/dhp-workflows/dhp-graph-provision-scholexplorer/pom.xml index c27c1b1b8..05ca7d4ce 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/pom.xml +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3 + 1.2.4-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-graph-provision/pom.xml b/dhp-workflows/dhp-graph-provision/pom.xml index 69dcd249e..fa1964773 100644 --- a/dhp-workflows/dhp-graph-provision/pom.xml +++ b/dhp-workflows/dhp-graph-provision/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3 + 1.2.4-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-stats-update/pom.xml b/dhp-workflows/dhp-stats-update/pom.xml index 148e93ae7..52f35ff07 100644 --- a/dhp-workflows/dhp-stats-update/pom.xml +++ b/dhp-workflows/dhp-stats-update/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3 + 1.2.4-SNAPSHOT 4.0.0 
dhp-stats-update diff --git a/dhp-workflows/dhp-worfklow-profiles/pom.xml b/dhp-workflows/dhp-worfklow-profiles/pom.xml index 34c622b1a..34996a021 100644 --- a/dhp-workflows/dhp-worfklow-profiles/pom.xml +++ b/dhp-workflows/dhp-worfklow-profiles/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3 + 1.2.4-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml index 1a1082ba2..9fbc6d714 100644 --- a/dhp-workflows/pom.xml +++ b/dhp-workflows/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib.dhp dhp - 1.2.3 + 1.2.4-SNAPSHOT ../ diff --git a/pom.xml b/pom.xml index 6ec6bfb3a..89b7e8829 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ 4.0.0 eu.dnetlib.dhp dhp - 1.2.3 + 1.2.4-SNAPSHOT pom @@ -38,7 +38,7 @@ scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git https://code-repo.d4science.org/D-Net/dnet-hadoop/ - dhp-1.2.3 + HEAD This module is the root descriptor for the dnet-hadoop project From ed787398b37d2276765c6ca5b3e57a628ca2b6bd Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Mon, 22 Jun 2020 11:45:14 +0200 Subject: [PATCH 11/37] refactoring wf --- .../broker/oa/GenerateEventsApplication.java | 190 ---------------- .../dhp/broker/oa/GenerateEventsJob.java | 106 +++++++++ .../dhp/broker/oa/JoinEntitiesJob.java | 94 ++++++++ .../dhp/broker/oa/PrepareGroupsJob.java | 88 ++++++++ ...ts.java => PrepareRelatedDatasetsJob.java} | 13 +- ...ts.java => PrepareRelatedProjectsJob.java} | 13 +- ...ava => PrepareRelatedPublicationsJob.java} | 13 +- ...s.java => PrepareRelatedSoftwaresJob.java} | 14 +- ...s.java => PrepareSimpleEntititiesJob.java} | 34 ++- .../dhp/broker/oa/util/ClusterUtils.java | 4 + ...rate_relations.json => common_params.json} | 4 +- .../oa/generate_all/oozie_app/workflow.xml | 210 +++++++++++++++++- ...roker_events.json => generate_events.json} | 2 +- .../broker/oa/generate_simple_entities.json | 14 -- 14 files changed, 546 insertions(+), 253 deletions(-) delete mode 100644 
dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinEntitiesJob.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java rename dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/{GenerateRelatedDatasets.java => PrepareRelatedDatasetsJob.java} (82%) rename dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/{GenerateRelatedProjects.java => PrepareRelatedProjectsJob.java} (84%) rename dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/{GenerateRelatedPublications.java => PrepareRelatedPublicationsJob.java} (83%) rename dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/{GenerateRelatedSoftwares.java => PrepareRelatedSoftwaresJob.java} (83%) rename dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/{GenerateSimpleEntitities.java => PrepareSimpleEntititiesJob.java} (68%) rename dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/{generate_relations.json => common_params.json} (64%) rename dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/{generate_broker_events.json => generate_events.json} (94%) delete mode 100644 dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_simple_entities.json diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java deleted file mode 100644 index db5992010..000000000 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java +++ /dev/null @@ 
-1,190 +0,0 @@ - -package eu.dnetlib.dhp.broker.oa; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - -import java.util.Optional; - -import org.apache.commons.io.IOUtils; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.TypedColumn; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.broker.objects.OaBrokerMainEntity; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.broker.model.Event; -import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; -import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; -import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; -import eu.dnetlib.dhp.broker.oa.util.EventFinder; -import eu.dnetlib.dhp.broker.oa.util.EventGroup; -import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultAggregator; -import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup; -import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.OaBrokerMainEntityAggregator; -import eu.dnetlib.dhp.schema.oaf.Publication; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.schema.oaf.Result; -import eu.dnetlib.dhp.utils.ISLookupClientFactory; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import eu.dnetlib.pace.config.DedupConfig; -import scala.Tuple2; - -public class GenerateEventsApplication { - - private static final Logger log = LoggerFactory.getLogger(GenerateEventsApplication.class); - - public static void main(final String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - GenerateEventsApplication.class - .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_broker_events.json"))); 
- parser.parseArgument(args); - - final Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - - final String graphPath = parser.get("graphPath"); - log.info("graphPath: {}", graphPath); - - final String eventsPath = parser.get("eventsPath"); - log.info("eventsPath: {}", eventsPath); - - final String isLookupUrl = parser.get("isLookupUrl"); - log.info("isLookupUrl: {}", isLookupUrl); - - final String dedupConfigProfileId = parser.get("dedupConfProfile"); - log.info("dedupConfigProfileId: {}", dedupConfigProfileId); - - final SparkConf conf = new SparkConf(); - // conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - // conf.registerKryoClasses(BrokerConstants.getModelClasses()); - - // TODO UNCOMMENT - // final DedupConfig dedupConfig = loadDedupConfig(isLookupUrl, dedupConfigProfileId); - final DedupConfig dedupConfig = null; - - runWithSparkSession(conf, isSparkSessionManaged, spark -> { - - ClusterUtils.removeDir(spark, eventsPath); - - // TODO REMOVE THIS - - expandResultsWithRelations(spark, graphPath, Publication.class) - .write() - .mode(SaveMode.Overwrite) - .json(eventsPath); - - // TODO UNCOMMENT THIS - // spark - // .emptyDataset(Encoders.bean(Event.class)) - // .union(generateEvents(spark, graphPath, Publication.class, dedupConfig)) - // .union(generateEvents(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class, dedupConfig)) - // .union(generateEvents(spark, graphPath, Software.class, dedupConfig)) - // .union(generateEvents(spark, graphPath, OtherResearchProduct.class, dedupConfig)) - // .write() - // .mode(SaveMode.Overwrite) - // .option("compression", "gzip") - // .json(eventsPath); - }); - - } - - private static Dataset generateEvents( - final SparkSession spark, - final String graphPath, - final Class sourceClass, - final DedupConfig dedupConfig) { - - final Dataset 
results = expandResultsWithRelations(spark, graphPath, sourceClass); - - final Dataset mergedRels = ClusterUtils - .readPath(spark, graphPath + "/relation", Relation.class) - .filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)); - - final TypedColumn, ResultGroup> aggr = new ResultAggregator() - .toColumn(); - - return results - .joinWith(mergedRels, results.col("openaireId").equalTo(mergedRels.col("source")), "inner") - .groupByKey( - (MapFunction, String>) t -> t._2.getTarget(), Encoders.STRING()) - .agg(aggr) - .map((MapFunction, ResultGroup>) t -> t._2, Encoders.bean(ResultGroup.class)) - .filter(rg -> rg.getData().size() > 1) - .map( - (MapFunction) g -> EventFinder.generateEvents(g, dedupConfig), - Encoders.bean(EventGroup.class)) - .flatMap(group -> group.getData().iterator(), Encoders.bean(Event.class)); - } - - private static Dataset expandResultsWithRelations( - final SparkSession spark, - final String graphPath, - final Class sourceClass) { - - // final Dataset datasets = readPath( - // spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class); - // final Dataset softwares = readPath(spark, graphPath + "/software", Software.class); - // final Dataset publications = readPath(spark, graphPath + "/publication", Publication.class); - - final Dataset r0 = ClusterUtils - .readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass) - .filter(r -> r.getDataInfo().getDeletedbyinference()) - .map(ConversionUtils::oafResultToBrokerResult, Encoders.bean(OaBrokerMainEntity.class)); - - // TODO UNCOMMENT THIS - // final Dataset r1 = join(r0, relatedProjects(spark, graphPath)); - // final Dataset r2 = join(r1, relatedDataset(spark, graphPath)); - // final Dataset r3 = join(r2, relatedPublications(spark, graphPath)); - // final Dataset r4 = join(r3, relatedSoftwares(spark, graphPath)); - - return r0; // TODO it should be r4 - } - - private static Dataset join(final Dataset sources, - final Dataset 
typedRels) { - - final TypedColumn, OaBrokerMainEntity> aggr = new OaBrokerMainEntityAggregator() - .toColumn(); - - return sources - .joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer") - .groupByKey( - (MapFunction, String>) t -> t._1.getOpenaireId(), Encoders.STRING()) - .agg(aggr) - .map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class)); - - } - - private static DedupConfig loadDedupConfig(final String isLookupUrl, final String profId) throws Exception { - - final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl); - - final String conf = isLookUpService - .getResourceProfileByQuery( - String - .format( - "for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", - profId)); - - final DedupConfig dedupConfig = new ObjectMapper().readValue(conf, DedupConfig.class); - dedupConfig.getPace().initModel(); - dedupConfig.getPace().initTranslationMap(); - // dedupConfig.getWf().setConfigurationId("???"); - - return dedupConfig; - } - -} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java new file mode 100644 index 000000000..3ea0086ff --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java @@ -0,0 +1,106 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import 
eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.model.Event; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.EventFinder; +import eu.dnetlib.dhp.broker.oa.util.EventGroup; +import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import eu.dnetlib.pace.config.DedupConfig; + +public class GenerateEventsJob { + + private static final Logger log = LoggerFactory.getLogger(GenerateEventsJob.class); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + GenerateEventsJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_events.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String graphPath = parser.get("graphPath"); + log.info("graphPath: {}", graphPath); + + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); + + final String isLookupUrl = parser.get("isLookupUrl"); + log.info("isLookupUrl: {}", isLookupUrl); + + final String dedupConfigProfileId = parser.get("dedupConfProfile"); + log.info("dedupConfigProfileId: {}", dedupConfigProfileId); + + final String eventsPath = workingPath + "/eventsPath"; + log.info("eventsPath: {}", eventsPath); + + final SparkConf conf = new SparkConf(); + + // TODO UNCOMMENT + // final DedupConfig dedupConfig = loadDedupConfig(isLookupUrl, dedupConfigProfileId); + final DedupConfig dedupConfig = null; + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, eventsPath); + + final Dataset groups = ClusterUtils + 
.readPath(spark, graphPath + "/relation", ResultGroup.class); + + final Dataset events = groups + .map( + (MapFunction) g -> EventFinder.generateEvents(g, dedupConfig), + Encoders.bean(EventGroup.class)) + .flatMap(group -> group.getData().iterator(), Encoders.bean(Event.class)); + + events.write().mode(SaveMode.Overwrite).json(eventsPath); + + }); + + } + + private static DedupConfig loadDedupConfig(final String isLookupUrl, final String profId) throws Exception { + + final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl); + + final String conf = isLookUpService + .getResourceProfileByQuery( + String + .format( + "for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", + profId)); + + final DedupConfig dedupConfig = new ObjectMapper().readValue(conf, DedupConfig.class); + dedupConfig.getPace().initModel(); + dedupConfig.getPace().initTranslationMap(); + // dedupConfig.getWf().setConfigurationId("???"); + + return dedupConfig; + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinEntitiesJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinEntitiesJob.java new file mode 100644 index 000000000..dac308f36 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinEntitiesJob.java @@ -0,0 +1,94 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.TypedColumn; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import 
eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.OaBrokerMainEntityAggregator; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDataset; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedPublication; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedSoftware; +import scala.Tuple2; + +public class JoinEntitiesJob { + + private static final Logger log = LoggerFactory.getLogger(JoinEntitiesJob.class); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + JoinEntitiesJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String graphPath = parser.get("graphPath"); + log.info("graphPath: {}", graphPath); + + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); + + final String joinedEntitiesPath = workingPath + "/joinedEntities"; + log.info("joinedEntitiesPath: {}", joinedEntitiesPath); + + final SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, joinedEntitiesPath); + + final Dataset r0 = ClusterUtils + .readPath(spark, graphPath + "/simpleEntities", OaBrokerMainEntity.class); + + final Dataset r1 = join( + r0, ClusterUtils.readPath(spark, graphPath + "/relatedProjects", RelatedProject.class)); + final Dataset r2 = join( + r1, ClusterUtils.readPath(spark, graphPath + "/relatedDatasets", RelatedDataset.class)); + final 
Dataset r3 = join( + r2, ClusterUtils.readPath(spark, graphPath + "/relatedPublications", RelatedPublication.class)); + final Dataset r4 = join( + r3, ClusterUtils.readPath(spark, graphPath + "/relatedSoftwares", RelatedSoftware.class)); + + r4.write().mode(SaveMode.Overwrite).json(joinedEntitiesPath); + + }); + + } + + private static Dataset join(final Dataset sources, + final Dataset typedRels) { + + final TypedColumn, OaBrokerMainEntity> aggr = new OaBrokerMainEntityAggregator() + .toColumn(); + + return sources + .joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer") + .groupByKey( + (MapFunction, String>) t -> t._1.getOpenaireId(), Encoders.STRING()) + .agg(aggr) + .map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class)); + + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java new file mode 100644 index 000000000..aa057eee8 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java @@ -0,0 +1,88 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.TypedColumn; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultAggregator; +import 
eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup; +import eu.dnetlib.dhp.schema.oaf.Relation; +import scala.Tuple2; + +public class PrepareGroupsJob { + + private static final Logger log = LoggerFactory.getLogger(PrepareGroupsJob.class); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + JoinEntitiesJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String graphPath = parser.get("graphPath"); + log.info("graphPath: {}", graphPath); + + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); + + final String groupsPath = workingPath + "/groups"; + log.info("groupsPath: {}", groupsPath); + + final SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, groupsPath); + + final Dataset results = ClusterUtils + .readPath(spark, graphPath + "/joinedEntities", OaBrokerMainEntity.class); + + final Dataset mergedRels = ClusterUtils + .readPath(spark, graphPath + "/relation", Relation.class) + .filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)); + + final TypedColumn, ResultGroup> aggr = new ResultAggregator() + .toColumn(); + + final Dataset groups = results + .joinWith(mergedRels, results.col("openaireId").equalTo(mergedRels.col("source")), "inner") + .groupByKey( + (MapFunction, String>) t -> t._2.getTarget(), + Encoders.STRING()) + .agg(aggr) + .map( + (MapFunction, ResultGroup>) t -> t._2, Encoders.bean(ResultGroup.class)) + .filter(rg -> rg.getData().size() > 1); + + groups + .write() + .mode(SaveMode.Overwrite) + 
.json(groupsPath); + + }); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedDatasets.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java similarity index 82% rename from dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedDatasets.java rename to dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java index 4a10fbabf..edf9b9a7e 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedDatasets.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java @@ -19,16 +19,16 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDataset; import eu.dnetlib.dhp.schema.oaf.Relation; -public class GenerateRelatedDatasets { +public class PrepareRelatedDatasetsJob { - private static final Logger log = LoggerFactory.getLogger(GenerateRelatedDatasets.class); + private static final Logger log = LoggerFactory.getLogger(PrepareRelatedDatasetsJob.class); public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils .toString( - GenerateRelatedDatasets.class - .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_relations.json"))); + PrepareRelatedDatasetsJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); parser.parseArgument(args); final Boolean isSparkSessionManaged = Optional @@ -40,7 +40,10 @@ public class GenerateRelatedDatasets { final String graphPath = parser.get("graphPath"); log.info("graphPath: {}", graphPath); - final String relsPath = parser.get("relsPath"); + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); + + final String relsPath = workingPath + 
"/relatedDatasets"; log.info("relsPath: {}", relsPath); final SparkConf conf = new SparkConf(); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedProjects.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java similarity index 84% rename from dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedProjects.java rename to dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java index 59ed388e7..00957972a 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedProjects.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java @@ -23,9 +23,9 @@ import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.schema.oaf.Relation; -public class GenerateRelatedProjects { +public class PrepareRelatedProjectsJob { - private static final Logger log = LoggerFactory.getLogger(GenerateRelatedProjects.class); + private static final Logger log = LoggerFactory.getLogger(PrepareRelatedProjectsJob.class); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -33,8 +33,8 @@ public class GenerateRelatedProjects { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils .toString( - GenerateRelatedProjects.class - .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_relations.json"))); + PrepareRelatedProjectsJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); parser.parseArgument(args); final Boolean isSparkSessionManaged = Optional @@ -46,7 +46,10 @@ public class GenerateRelatedProjects { final String graphPath = parser.get("graphPath"); log.info("graphPath: {}", graphPath); - final String relsPath = parser.get("relsPath"); + final String workingPath = parser.get("workingPath"); + 
log.info("workingPath: {}", workingPath); + + final String relsPath = workingPath + "/relatedProjects"; log.info("relsPath: {}", relsPath); final SparkConf conf = new SparkConf(); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedPublications.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java similarity index 83% rename from dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedPublications.java rename to dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java index 0c20081dc..945fd9ed7 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedPublications.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java @@ -22,9 +22,9 @@ import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedPublication; import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Relation; -public class GenerateRelatedPublications { +public class PrepareRelatedPublicationsJob { - private static final Logger log = LoggerFactory.getLogger(GenerateRelatedPublications.class); + private static final Logger log = LoggerFactory.getLogger(PrepareRelatedPublicationsJob.class); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -32,8 +32,8 @@ public class GenerateRelatedPublications { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils .toString( - GenerateRelatedPublications.class - .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_relations.json"))); + PrepareRelatedPublicationsJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); parser.parseArgument(args); final Boolean isSparkSessionManaged = Optional @@ -45,7 +45,10 @@ public class GenerateRelatedPublications { final String graphPath = 
parser.get("graphPath"); log.info("graphPath: {}", graphPath); - final String relsPath = parser.get("relsPath"); + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); + + final String relsPath = workingPath + "/relatedPublications"; log.info("relsPath: {}", relsPath); final SparkConf conf = new SparkConf(); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedSoftwares.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java similarity index 83% rename from dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedSoftwares.java rename to dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java index b95788846..edb8dc1c3 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedSoftwares.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java @@ -22,9 +22,9 @@ import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedSoftware; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Software; -public class GenerateRelatedSoftwares { +public class PrepareRelatedSoftwaresJob { - private static final Logger log = LoggerFactory.getLogger(GenerateRelatedSoftwares.class); + private static final Logger log = LoggerFactory.getLogger(PrepareRelatedSoftwaresJob.class); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -32,8 +32,8 @@ public class GenerateRelatedSoftwares { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils .toString( - GenerateRelatedSoftwares.class - .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_relations.json"))); + PrepareRelatedSoftwaresJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); parser.parseArgument(args); final 
Boolean isSparkSessionManaged = Optional @@ -45,7 +45,10 @@ public class GenerateRelatedSoftwares { final String graphPath = parser.get("graphPath"); log.info("graphPath: {}", graphPath); - final String relsPath = parser.get("relsPath"); + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); + + final String relsPath = workingPath + "/relatedSoftwares"; log.info("relsPath: {}", relsPath); final SparkConf conf = new SparkConf(); @@ -53,6 +56,7 @@ public class GenerateRelatedSoftwares { runWithSparkSession(conf, isSparkSessionManaged, spark -> { ClusterUtils.removeDir(spark, relsPath); + final Dataset softwares = ClusterUtils.readPath(spark, graphPath + "/software", Software.class); final Dataset rels = ClusterUtils.readPath(spark, graphPath + "/relation", Relation.class); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateSimpleEntitities.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareSimpleEntititiesJob.java similarity index 68% rename from dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateSimpleEntitities.java rename to dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareSimpleEntititiesJob.java index 59485d5cf..213003db2 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateSimpleEntitities.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareSimpleEntititiesJob.java @@ -18,19 +18,21 @@ import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.Software; -public class 
GenerateSimpleEntitities { +public class PrepareSimpleEntititiesJob { - private static final Logger log = LoggerFactory.getLogger(GenerateSimpleEntitities.class); + private static final Logger log = LoggerFactory.getLogger(PrepareSimpleEntititiesJob.class); public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils .toString( - GenerateSimpleEntitities.class - .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_simple_entities.json"))); + PrepareSimpleEntititiesJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); parser.parseArgument(args); final Boolean isSparkSessionManaged = Optional @@ -42,7 +44,10 @@ public class GenerateSimpleEntitities { final String graphPath = parser.get("graphPath"); log.info("graphPath: {}", graphPath); - final String simpleEntitiesPath = parser.get("simpleEntitiesPath"); + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); + + final String simpleEntitiesPath = workingPath + "/simpleEntities"; log.info("simpleEntitiesPath: {}", simpleEntitiesPath); final SparkConf conf = new SparkConf(); @@ -51,27 +56,18 @@ public class GenerateSimpleEntitities { ClusterUtils.removeDir(spark, simpleEntitiesPath); - expandResultsWithRelations(spark, graphPath, Publication.class) + prepareSimpleEntities(spark, graphPath, Publication.class) + .union(prepareSimpleEntities(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class)) + .union(prepareSimpleEntities(spark, graphPath, Software.class)) + .union(prepareSimpleEntities(spark, graphPath, OtherResearchProduct.class)) .write() .mode(SaveMode.Overwrite) .json(simpleEntitiesPath); - - // TODO UNCOMMENT THIS - // spark - // .emptyDataset(Encoders.bean(Event.class)) - // .union(generateEvents(spark, graphPath, Publication.class, dedupConfig)) - // .union(generateEvents(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class, dedupConfig)) - 
// .union(generateEvents(spark, graphPath, Software.class, dedupConfig)) - // .union(generateEvents(spark, graphPath, OtherResearchProduct.class, dedupConfig)) - // .write() - // .mode(SaveMode.Overwrite) - // .option("compression", "gzip") - // .json(eventsPath); }); } - private static Dataset expandResultsWithRelations( + private static Dataset prepareSimpleEntities( final SparkSession spark, final String graphPath, final Class sourceClass) { diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java index 8bcea5e6e..15a1ddd88 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java @@ -14,6 +14,10 @@ public class ClusterUtils { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + public static void createDirIfMissing(final SparkSession spark, final String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } + public static void removeDir(final SparkSession spark, final String path) { HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); } diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_relations.json b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/common_params.json similarity index 64% rename from dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_relations.json rename to dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/common_params.json index 32fd1d8f3..adee1888a 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_relations.json +++ 
b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/common_params.json @@ -7,8 +7,8 @@ }, { "paramName": "o", - "paramLongName": "relsPath", - "paramDescription": "the path where the generated relations will be stored", + "paramLongName": "workingPath", + "paramDescription": "the path where the temporary data will be stored", "paramRequired": true } ] diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml index ea9aabcfc..74abcd268 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml @@ -6,8 +6,8 @@ the path where the graph is stored - eventsOutputPath - the path where the the events will be stored + workingPath + the path where the generated data will be stored isLookupUrl @@ -73,18 +73,35 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + + + + + + + + + + + + + + + + + + yarn cluster - GenerateEvents - eu.dnetlib.dhp.broker.oa.GenerateEventsApplication + PrepareSimpleEntititiesJob + eu.dnetlib.dhp.broker.oa.PrepareSimpleEntititiesJob dhp-broker-events-${projectVersion}.jar --executor-cores=${sparkExecutorCores} @@ -97,7 +114,183 @@ --conf spark.sql.shuffle.partitions=3840 --graphPath${graphInputPath} - --eventsPath${eventsOutputPath} + --workingPath${workingPath} + + + + + + + + + yarn + cluster + PrepareRelatedDatasetsJob + eu.dnetlib.dhp.broker.oa.PrepareRelatedDatasetsJob + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf 
spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + + yarn + cluster + PrepareRelatedProjectsJob + eu.dnetlib.dhp.broker.oa.PrepareRelatedProjectsJob + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + + yarn + cluster + PrepareRelatedPublicationsJob + eu.dnetlib.dhp.broker.oa.PrepareRelatedPublicationsJob + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + + yarn + cluster + PrepareRelatedSoftwaresJob + eu.dnetlib.dhp.broker.oa.PrepareRelatedSoftwaresJob + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf 
spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + + + yarn + cluster + JoinEntitiesJob + eu.dnetlib.dhp.broker.oa.JoinEntitiesJob + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + + yarn + cluster + PrepareGroupsJob + eu.dnetlib.dhp.broker.oa.PrepareGroupsJob + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + + yarn + cluster + GenerateEventsJob + eu.dnetlib.dhp.broker.oa.GenerateEventsJob + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf 
spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} --isLookupUrl${isLookupUrl} --dedupConfProfile${dedupConfProfId} @@ -105,6 +298,9 @@ + + + diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_broker_events.json b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_events.json similarity index 94% rename from dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_broker_events.json rename to dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_events.json index 6ab6d9a2d..d185bc73d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_broker_events.json +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_events.json @@ -7,7 +7,7 @@ }, { "paramName": "o", - "paramLongName": "eventsPath", + "paramLongName": "workingPath", "paramDescription": "the path where the generated events will be stored", "paramRequired": true }, diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_simple_entities.json b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_simple_entities.json deleted file mode 100644 index 6f5e330f6..000000000 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_simple_entities.json +++ /dev/null @@ -1,14 +0,0 @@ -[ - { - "paramName": "g", - "paramLongName": "graphPath", - "paramDescription": "the path where there the graph is stored", - "paramRequired": true - }, - { - "paramName": "o", - "paramLongName": "simpleEntitiesPath", - "paramDescription": "the path where the generated simple entities (without relations) will be stored", - 
"paramRequired": true - } -] From 3ce20c198ebd38b8c66c162bd4884986d4fc4e78 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Mon, 22 Jun 2020 12:14:25 +0200 Subject: [PATCH 12/37] reformatting --- .../dhp/broker/oa/generate_all/oozie_app/workflow.xml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml index 74abcd268..9783fcab6 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml @@ -79,7 +79,6 @@ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - @@ -120,7 +119,6 @@ - yarn @@ -145,7 +143,6 @@ - yarn @@ -170,7 +167,6 @@ - yarn @@ -195,7 +191,6 @@ - yarn @@ -246,7 +241,6 @@ - yarn @@ -271,7 +265,6 @@ - yarn @@ -297,10 +290,6 @@ - - - - From e162ba5075e43b09b784000f513266e4c479549c Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 22 Jun 2020 14:12:28 +0200 Subject: [PATCH 13/37] added dnet workflows to orchestrate the execution of graph2hive, updateSolr and updateStats oozie wfs --- .../{provision.xml => graph_construction.xml} | 2 +- .../dhp/wf/profiles/graph_to_hiveDB.xml | 73 ++++++++++++++ .../dnetlib/dhp/wf/profiles/update_solr.xml | 98 +++++++++++++++++++ .../dnetlib/dhp/wf/profiles/update_stats.xml | 74 ++++++++++++++ 4 files changed, 246 insertions(+), 1 deletion(-) rename dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/{provision.xml => graph_construction.xml} (99%) create mode 100644 dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_to_hiveDB.xml create mode 100644 
dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/update_solr.xml create mode 100644 dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/update_stats.xml diff --git a/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/provision.xml b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_construction.xml similarity index 99% rename from dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/provision.xml rename to dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_construction.xml index 28cbde70d..819b3e12d 100644 --- a/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/provision.xml +++ b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_construction.xml @@ -7,7 +7,7 @@ - Data Provision [OCEAN] + Graph Construction [OCEAN] Data Provision 30 diff --git a/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_to_hiveDB.xml b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_to_hiveDB.xml new file mode 100644 index 000000000..0ace12ea3 --- /dev/null +++ b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_to_hiveDB.xml @@ -0,0 +1,73 @@ + +
+ + + + + +
+ + Graph to HiveDB [OCEAN] + Data Provision + 30 + + + Set the path containing the AGGREGATOR graph + + inputPath + + + + + + + + Set the target path to store the RAW graph + + hiveDbName + + + + + + + + + wait configurations + + + + + + + create the AGGREGATOR graph + + executeOozieJob + IIS + + { + 'inputPath' : 'inputPath', + 'hiveDbName' : 'hiveDbName' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/graph/hive/oozie_app' + } + + build-report + + + + + + + + + wf_20200615_163630_609 + 2020-06-15T17:08:00+00:00 + SUCCESS + + + +
\ No newline at end of file diff --git a/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/update_solr.xml b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/update_solr.xml new file mode 100644 index 000000000..8a7738bcf --- /dev/null +++ b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/update_solr.xml @@ -0,0 +1,98 @@ + +
+ + + + + +
+ + Update Solr [OCEAN] + Data Provision + 30 + + + Set the path containing the AGGREGATOR graph + + inputGraphRootPath + + + + + + + + Set the target path to store the RAW graph + + format + TMF + + + + + + + Set the lookup address + + isLookupUrl + http://beta.services.openaire.eu:8280/is/services/isLookUp?wsdl + + + + + + + + wait configurations + + + + + + + create the AGGREGATOR graph + + executeOozieJob + IIS + + { + 'inputGraphRootPath' : 'inputGraphRootPath', + 'isLookupUrl' : 'isLookupUrl', + 'format' : 'format' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/provision/oozie_app', + 'maxRelations' : '100', + 'relPartitions' : '3000', + 'batchSize' : '2000', + 'relationFilter' : 'isAuthorInstitutionOf,produces,hasAmongTopNSimilarDocuments', + 'otherDsTypeId' : 'scholarcomminfra,infospace,pubsrepository::mock,entityregistry,entityregistry::projects,entityregistry::repositories,websource', + 'resumeFrom' : 'prepare_relations', + 'sparkDriverMemoryForJoining' : '3G', + 'sparkExecutorMemoryForJoining' : '7G', + 'sparkExecutorCoresForJoining' : '4', + 'sparkDriverMemoryForIndexing' : '2G', + 'sparkExecutorMemoryForIndexing' : '2G', + 'sparkExecutorCoresForIndexing' : '64', + 'sparkNetworkTimeout' : '600', + 'workingDir' : '/tmp/beta_provision/working_dir/update_solr' + } + + build-report + + + + + + + + + wf_20200615_163630_609 + 2020-06-15T17:08:00+00:00 + SUCCESS + + + +
\ No newline at end of file diff --git a/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/update_stats.xml b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/update_stats.xml new file mode 100644 index 000000000..a91b6302e --- /dev/null +++ b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/update_stats.xml @@ -0,0 +1,74 @@ + +
+ + + + + +
+ + Update Stats [OCEAN] + Data Provision + 30 + + + Set the path containing the AGGREGATOR graph + + openaire_db_name + + + + + + + + Set the target path to store the RAW graph + + stats_db_name + + + + + + + + + wait configurations + + + + + + + create the AGGREGATOR graph + + executeOozieJob + IIS + + { + 'openaire_db_name' : 'openaire_db_name', + 'stats_db_name' : 'stats_db_name' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/graph/stats/oozie_app', + 'hive_timeout' : '3000' + } + + build-report + + + + + + + + + wf_20200615_163630_609 + 2020-06-15T17:08:00+00:00 + SUCCESS + + + +
\ No newline at end of file From af2f7705fc631c7dbd7d0f7918f21e525e507ab2 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Tue, 23 Jun 2020 08:37:35 +0200 Subject: [PATCH 14/37] partial refactoring of some joins --- .../broker/oa/PrepareRelatedDatasetsJob.java | 16 +- .../broker/oa/PrepareRelatedProjectsJob.java | 4 +- .../oa/PrepareRelatedPublicationsJob.java | 29 +-- .../broker/oa/PrepareRelatedSoftwaresJob.java | 28 ++- .../broker/oa/PrepareSimpleEntititiesJob.java | 1 + .../dhp/broker/oa/util/ClusterUtils.java | 4 + .../oa/generate_all/oozie_app/workflow.xml | 9 +- .../oa/partial/oozie_app/config-default.xml | 18 ++ .../broker/oa/partial/oozie_app/workflow.xml | 215 ++++++++++++++++++ 9 files changed, 289 insertions(+), 35 deletions(-) create mode 100644 dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java index edf9b9a7e..bcd333d56 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java @@ -13,6 +13,7 @@ import org.apache.spark.sql.SaveMode; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import eu.dnetlib.broker.objects.OaBrokerRelatedDataset; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; @@ -52,18 +53,23 @@ public class PrepareRelatedDatasetsJob { ClusterUtils.removeDir(spark, relsPath); - final Dataset datasets = ClusterUtils - .readPath(spark, graphPath + 
"/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class); + final Dataset datasets = ClusterUtils + .readPath(spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class) + .filter(d -> !ClusterUtils.isDedupRoot(d.getId())) + .map(ConversionUtils::oafDatasetToBrokerDataset, Encoders.bean(OaBrokerRelatedDataset.class)); - final Dataset rels = ClusterUtils.readPath(spark, graphPath + "/relation", Relation.class); + final Dataset rels = ClusterUtils + .readPath(spark, graphPath + "/relation", Relation.class) + .filter(r -> !ClusterUtils.isDedupRoot(r.getSource())) + .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget())); rels - .joinWith(datasets, datasets.col("id").equalTo(rels.col("target")), "inner") + .joinWith(datasets, datasets.col("openaireId").equalTo(rels.col("target")), "inner") .map( t -> new RelatedDataset( t._1.getSource(), t._1.getRelType(), - ConversionUtils.oafDatasetToBrokerDataset(t._2)), + t._2), Encoders.bean(RelatedDataset.class)) .write() .mode(SaveMode.Overwrite) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java index 00957972a..0460bfabb 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java @@ -62,7 +62,9 @@ public class PrepareRelatedProjectsJob { final Dataset rels = ClusterUtils .readPath(spark, graphPath + "/relation", Relation.class) - .filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT)); + .filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT)) + .filter(r -> !ClusterUtils.isDedupRoot(r.getSource())) + .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget())); rels .joinWith(projects, projects.col("id").equalTo(rels.col("target")), "inner") diff --git 
a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java index 945fd9ed7..f3db509bb 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java @@ -15,6 +15,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.broker.objects.OaBrokerRelatedPublication; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; @@ -31,9 +32,8 @@ public class PrepareRelatedPublicationsJob { public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils - .toString( - PrepareRelatedPublicationsJob.class - .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); + .toString(PrepareRelatedPublicationsJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); parser.parseArgument(args); final Boolean isSparkSessionManaged = Optional @@ -57,19 +57,22 @@ public class PrepareRelatedPublicationsJob { ClusterUtils.removeDir(spark, relsPath); - final Dataset pubs = ClusterUtils - .readPath(spark, graphPath + "/publication", Publication.class); + final Dataset pubs = ClusterUtils + .readPath(spark, graphPath + "/publication", Publication.class) + .filter(p -> !ClusterUtils.isDedupRoot(p.getId())) + .map(ConversionUtils::oafPublicationToBrokerPublication, Encoders.bean(OaBrokerRelatedPublication.class)); - final Dataset rels = ClusterUtils.readPath(spark, graphPath + "/relation", Relation.class); + final Dataset rels = ClusterUtils + .readPath(spark, graphPath + "/relation", Relation.class) + 
.filter(r -> !ClusterUtils.isDedupRoot(r.getSource())) + .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget())); rels - .joinWith(pubs, pubs.col("id").equalTo(rels.col("target")), "inner") - .map( - t -> new RelatedPublication( - t._1.getSource(), - t._1.getRelType(), - ConversionUtils.oafPublicationToBrokerPublication(t._2)), - Encoders.bean(RelatedPublication.class)) + .joinWith(pubs, pubs.col("openaireId").equalTo(rels.col("target")), "inner") + .map(t -> new RelatedPublication( + t._1.getSource(), + t._1.getRelType(), + t._2), Encoders.bean(RelatedPublication.class)) .write() .mode(SaveMode.Overwrite) .json(relsPath); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java index edb8dc1c3..ffc3a8c65 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java @@ -15,6 +15,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; @@ -31,9 +32,8 @@ public class PrepareRelatedSoftwaresJob { public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils - .toString( - PrepareRelatedSoftwaresJob.class - .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); + .toString(PrepareRelatedSoftwaresJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); parser.parseArgument(args); final Boolean isSparkSessionManaged = Optional @@ -57,18 +57,22 @@ public class 
PrepareRelatedSoftwaresJob { ClusterUtils.removeDir(spark, relsPath); - final Dataset softwares = ClusterUtils.readPath(spark, graphPath + "/software", Software.class); + final Dataset softwares = ClusterUtils + .readPath(spark, graphPath + "/software", Software.class) + .filter(sw -> !ClusterUtils.isDedupRoot(sw.getId())) + .map(ConversionUtils::oafSoftwareToBrokerSoftware, Encoders.bean(OaBrokerRelatedSoftware.class)); - final Dataset rels = ClusterUtils.readPath(spark, graphPath + "/relation", Relation.class); + final Dataset rels = ClusterUtils + .readPath(spark, graphPath + "/relation", Relation.class) + .filter(r -> !ClusterUtils.isDedupRoot(r.getSource())) + .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget())); rels - .joinWith(softwares, softwares.col("id").equalTo(rels.col("target")), "inner") - .map( - t -> new RelatedSoftware( - t._1.getSource(), - t._1.getRelType(), - ConversionUtils.oafSoftwareToBrokerSoftware(t._2)), - Encoders.bean(RelatedSoftware.class)) + .joinWith(softwares, softwares.col("openaireId").equalTo(rels.col("target")), "inner") + .map(t -> new RelatedSoftware( + t._1.getSource(), + t._1.getRelType(), + t._2), Encoders.bean(RelatedSoftware.class)) .write() .mode(SaveMode.Overwrite) .json(relsPath); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareSimpleEntititiesJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareSimpleEntititiesJob.java index 213003db2..1b9c279fd 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareSimpleEntititiesJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareSimpleEntititiesJob.java @@ -74,6 +74,7 @@ public class PrepareSimpleEntititiesJob { return ClusterUtils .readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass) + .filter(r -> !ClusterUtils.isDedupRoot(r.getId())) .filter(r -> r.getDataInfo().getDeletedbyinference()) 
.map(ConversionUtils::oafResultToBrokerResult, Encoders.bean(OaBrokerMainEntity.class)); } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java index 15a1ddd88..968bde881 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java @@ -32,4 +32,8 @@ public class ClusterUtils { .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); } + public static boolean isDedupRoot(final String id) { + return id.contains("dedup_wf_"); + } + } diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml index 9783fcab6..bec6f221d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml @@ -88,11 +88,11 @@
- + + - @@ -119,6 +119,7 @@ + yarn @@ -190,7 +191,7 @@ - + yarn @@ -214,7 +215,7 @@ - + diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/config-default.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/config-default.xml new file mode 100644 index 000000000..2e0ed9aee --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml new file mode 100644 index 000000000..253910595 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml @@ -0,0 +1,215 @@ + + + + + graphInputPath + the path where the graph is stored + + + workingPath + the path where the the generated data will be stored + + + isLookupUrl + the address of the lookUp service + + + dedupConfProfId + the id of a valid Dedup Configuration Profile + + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + oozieActionShareLibForSpark2 + oozie action sharelib for spark 2.* + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + spark 2.* sql query execution listeners classname + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir 
+ spark 2.* event log dir location + + + + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + yarn + cluster + PrepareRelatedPublicationsJob + eu.dnetlib.dhp.broker.oa.PrepareRelatedPublicationsJob + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + + yarn + cluster + PrepareRelatedDatasetsJob + eu.dnetlib.dhp.broker.oa.PrepareRelatedDatasetsJob + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + yarn + cluster + JoinEntitiesJob + eu.dnetlib.dhp.broker.oa.JoinEntitiesJob + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf 
spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + yarn + cluster + PrepareGroupsJob + eu.dnetlib.dhp.broker.oa.PrepareGroupsJob + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + yarn + cluster + GenerateEventsJob + eu.dnetlib.dhp.broker.oa.GenerateEventsJob + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + --isLookupUrl${isLookupUrl} + --dedupConfProfile${dedupConfProfId} + + + + + + + + \ No newline at end of file From c3286f4c37d8e828ae00240a45e933fc03d9fabc Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Tue, 23 Jun 2020 09:32:32 +0200 Subject: [PATCH 15/37] fixed relType --- .../broker/oa/PrepareRelatedDatasetsJob.java | 15 ++++++++----- .../broker/oa/PrepareRelatedProjectsJob.java | 17 
+++++++------- .../oa/PrepareRelatedPublicationsJob.java | 22 +++++++++++++------ .../broker/oa/PrepareRelatedSoftwaresJob.java | 14 +++++++----- .../aggregators/withRels/RelatedDataset.java | 13 ++--------- .../aggregators/withRels/RelatedProject.java | 12 +--------- .../withRels/RelatedPublication.java | 13 +---------- .../aggregators/withRels/RelatedSoftware.java | 13 ++--------- 8 files changed, 47 insertions(+), 72 deletions(-) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java index bcd333d56..110f5f317 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java @@ -15,9 +15,11 @@ import org.slf4j.LoggerFactory; import eu.dnetlib.broker.objects.OaBrokerRelatedDataset; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDataset; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Relation; public class PrepareRelatedDatasetsJob { @@ -60,17 +62,18 @@ public class PrepareRelatedDatasetsJob { final Dataset rels = ClusterUtils .readPath(spark, graphPath + "/relation", Relation.class) + .filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT)) + .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)) .filter(r -> !ClusterUtils.isDedupRoot(r.getSource())) .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget())); rels .joinWith(datasets, datasets.col("openaireId").equalTo(rels.col("target")), "inner") - .map( - t -> new RelatedDataset( - t._1.getSource(), 
- t._1.getRelType(), - t._2), - Encoders.bean(RelatedDataset.class)) + .map(t -> { + final RelatedDataset rel = new RelatedDataset(t._1.getSource(), t._2); + rel.getRelDataset().setRelType(t._1.getRelClass()); + return rel; + }, Encoders.bean(RelatedDataset.class)) .write() .mode(SaveMode.Overwrite) .json(relsPath); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java index 0460bfabb..3ae240982 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java @@ -15,7 +15,9 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.broker.objects.OaBrokerProject; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject; @@ -58,22 +60,21 @@ public class PrepareRelatedProjectsJob { ClusterUtils.removeDir(spark, relsPath); - final Dataset projects = ClusterUtils.readPath(spark, graphPath + "/project", Project.class); + final Dataset projects = ClusterUtils + .readPath(spark, graphPath + "/project", Project.class) + .filter(p -> !ClusterUtils.isDedupRoot(p.getId())) + .map(ConversionUtils::oafProjectToBrokerProject, Encoders.bean(OaBrokerProject.class)); final Dataset rels = ClusterUtils .readPath(spark, graphPath + "/relation", Relation.class) .filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT)) + .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)) .filter(r -> !ClusterUtils.isDedupRoot(r.getSource())) .filter(r -> 
!ClusterUtils.isDedupRoot(r.getTarget())); rels - .joinWith(projects, projects.col("id").equalTo(rels.col("target")), "inner") - .map( - t -> new RelatedProject( - t._1.getSource(), - t._1.getRelType(), - ConversionUtils.oafProjectToBrokerProject(t._2)), - Encoders.bean(RelatedProject.class)) + .joinWith(projects, projects.col("openaireId").equalTo(rels.col("target")), "inner") + .map(t -> new RelatedProject(t._1.getSource(), t._2), Encoders.bean(RelatedProject.class)) .write() .mode(SaveMode.Overwrite) .json(relsPath); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java index f3db509bb..17e078c2c 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java @@ -17,9 +17,11 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.broker.objects.OaBrokerRelatedPublication; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedPublication; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Relation; @@ -32,8 +34,9 @@ public class PrepareRelatedPublicationsJob { public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils - .toString(PrepareRelatedPublicationsJob.class - .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); + .toString( + PrepareRelatedPublicationsJob.class + 
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); parser.parseArgument(args); final Boolean isSparkSessionManaged = Optional @@ -60,19 +63,24 @@ public class PrepareRelatedPublicationsJob { final Dataset pubs = ClusterUtils .readPath(spark, graphPath + "/publication", Publication.class) .filter(p -> !ClusterUtils.isDedupRoot(p.getId())) - .map(ConversionUtils::oafPublicationToBrokerPublication, Encoders.bean(OaBrokerRelatedPublication.class)); + .map( + ConversionUtils::oafPublicationToBrokerPublication, + Encoders.bean(OaBrokerRelatedPublication.class)); final Dataset rels = ClusterUtils .readPath(spark, graphPath + "/relation", Relation.class) + .filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT)) + .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)) .filter(r -> !ClusterUtils.isDedupRoot(r.getSource())) .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget())); rels .joinWith(pubs, pubs.col("openaireId").equalTo(rels.col("target")), "inner") - .map(t -> new RelatedPublication( - t._1.getSource(), - t._1.getRelType(), - t._2), Encoders.bean(RelatedPublication.class)) + .map(t -> { + final RelatedPublication rel = new RelatedPublication(t._1.getSource(), t._2); + rel.getRelPublication().setRelType(t._1.getRelClass()); + return rel; + }, Encoders.bean(RelatedPublication.class)) .write() .mode(SaveMode.Overwrite) .json(relsPath); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java index ffc3a8c65..0704fb44a 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java @@ -17,9 +17,11 @@ import com.fasterxml.jackson.databind.ObjectMapper; import 
eu.dnetlib.broker.objects.OaBrokerRelatedSoftware; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedSoftware; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Software; @@ -32,8 +34,9 @@ public class PrepareRelatedSoftwaresJob { public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils - .toString(PrepareRelatedSoftwaresJob.class - .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); + .toString( + PrepareRelatedSoftwaresJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); parser.parseArgument(args); final Boolean isSparkSessionManaged = Optional @@ -64,15 +67,14 @@ public class PrepareRelatedSoftwaresJob { final Dataset rels = ClusterUtils .readPath(spark, graphPath + "/relation", Relation.class) + .filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT)) + .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)) .filter(r -> !ClusterUtils.isDedupRoot(r.getSource())) .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget())); rels .joinWith(softwares, softwares.col("openaireId").equalTo(rels.col("target")), "inner") - .map(t -> new RelatedSoftware( - t._1.getSource(), - t._1.getRelType(), - t._2), Encoders.bean(RelatedSoftware.class)) + .map(t -> new RelatedSoftware(t._1.getSource(), t._2), Encoders.bean(RelatedSoftware.class)) .write() .mode(SaveMode.Overwrite) .json(relsPath); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDataset.java 
b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDataset.java index daf75ea2e..0925e3291 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDataset.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDataset.java @@ -11,16 +11,15 @@ public class RelatedDataset implements Serializable { * */ private static final long serialVersionUID = 774487705184038324L; + private String source; - private String relType; private OaBrokerRelatedDataset relDataset; public RelatedDataset() { } - public RelatedDataset(final String source, final String relType, final OaBrokerRelatedDataset relDataset) { + public RelatedDataset(final String source, final OaBrokerRelatedDataset relDataset) { this.source = source; - this.relType = relType; this.relDataset = relDataset; } @@ -32,14 +31,6 @@ public class RelatedDataset implements Serializable { this.source = source; } - public String getRelType() { - return relType; - } - - public void setRelType(final String relType) { - this.relType = relType; - } - public OaBrokerRelatedDataset getRelDataset() { return relDataset; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProject.java index 4116c8c77..74d19fe9d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProject.java @@ -13,15 +13,13 @@ public class RelatedProject implements Serializable { private static final long serialVersionUID = 4941437626549329870L; private String source; - private String relType; private OaBrokerProject 
relProject; public RelatedProject() { } - public RelatedProject(final String source, final String relType, final OaBrokerProject relProject) { + public RelatedProject(final String source, final OaBrokerProject relProject) { this.source = source; - this.relType = relType; this.relProject = relProject; } @@ -33,14 +31,6 @@ public class RelatedProject implements Serializable { this.source = source; } - public String getRelType() { - return relType; - } - - public void setRelType(final String relType) { - this.relType = relType; - } - public OaBrokerProject getRelProject() { return relProject; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublication.java index 9e222a952..ed6aeeab1 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublication.java @@ -13,16 +13,13 @@ public class RelatedPublication implements Serializable { private static final long serialVersionUID = 9021609640411395128L; private String source; - private String relType; private OaBrokerRelatedPublication relPublication; public RelatedPublication() { } - public RelatedPublication(final String source, final String relType, - final OaBrokerRelatedPublication relPublication) { + public RelatedPublication(final String source, final OaBrokerRelatedPublication relPublication) { this.source = source; - this.relType = relType; this.relPublication = relPublication; } @@ -34,14 +31,6 @@ public class RelatedPublication implements Serializable { this.source = source; } - public String getRelType() { - return relType; - } - - public void setRelType(final String relType) { - this.relType = relType; - } - public 
OaBrokerRelatedPublication getRelPublication() { return relPublication; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftware.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftware.java index 2f3b8668c..0aa3a4045 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftware.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftware.java @@ -11,16 +11,15 @@ public class RelatedSoftware implements Serializable { * */ private static final long serialVersionUID = 7573383356943300157L; + private String source; - private String relType; private OaBrokerRelatedSoftware relSoftware; public RelatedSoftware() { } - public RelatedSoftware(final String source, final String relType, final OaBrokerRelatedSoftware relSoftware) { + public RelatedSoftware(final String source, final OaBrokerRelatedSoftware relSoftware) { this.source = source; - this.relType = relType; this.relSoftware = relSoftware; } @@ -32,14 +31,6 @@ public class RelatedSoftware implements Serializable { this.source = source; } - public String getRelType() { - return relType; - } - - public void setRelType(final String relType) { - this.relType = relType; - } - public OaBrokerRelatedSoftware getRelSoftware() { return relSoftware; } From 38bb45d0b6cd28cc8c73f89b681b7def19877c47 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Tue, 23 Jun 2020 10:14:39 +0200 Subject: [PATCH 16/37] test osf:refereed --- .../src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java | 2 ++ .../test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml | 1 + .../test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml | 1 + 3 files changed, 4 insertions(+) diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java 
b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 9bd20303f..b1f0ecf0d 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -111,6 +111,7 @@ public class MappersTest { assertNotNull(i.getAccessright()); assertEquals("OPEN", i.getAccessright().getClassid()); }); + assertEquals("0001", p.getInstance().get(0).getRefereed().getClassid()); assertNotNull(p.getBestaccessright()); assertEquals("OPEN", p.getBestaccessright().getClassid()); @@ -217,6 +218,7 @@ public class MappersTest { assertNotNull(i.getAccessright()); assertEquals("OPEN", i.getAccessright().getClassid()); }); + assertEquals("0001", d.getInstance().get(0).getRefereed().getClassid()); assertValidId(r1.getSource()); assertValidId(r1.getTarget()); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml index 2cb0ba1c7..ead22aa96 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml @@ -57,6 +57,7 @@ 10.3897/oneeco.2.e13718 https://oneecosystem.pensoft.net/article/13718/ One Ecosystem + 0001 diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml index 88ae9d106..5525a2753 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml @@ -90,6 +90,7 @@ corda_______::226852 + 0001s Date: Tue, 23 Jun 2020 10:24:15 +0200 Subject: [PATCH 17/37] 
filter of valid resultResult relations --- .../dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java | 3 +-- .../dhp/broker/oa/PrepareRelatedPublicationsJob.java | 3 +-- .../java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java | 8 ++++++++ 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java index 110f5f317..fe9c87e87 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java @@ -15,7 +15,6 @@ import org.slf4j.LoggerFactory; import eu.dnetlib.broker.objects.OaBrokerRelatedDataset; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDataset; @@ -63,7 +62,7 @@ public class PrepareRelatedDatasetsJob { final Dataset rels = ClusterUtils .readPath(spark, graphPath + "/relation", Relation.class) .filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT)) - .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)) + .filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass())) .filter(r -> !ClusterUtils.isDedupRoot(r.getSource())) .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget())); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java index 17e078c2c..8814ef3e0 100644 --- 
a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java @@ -17,7 +17,6 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.broker.objects.OaBrokerRelatedPublication; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedPublication; @@ -70,7 +69,7 @@ public class PrepareRelatedPublicationsJob { final Dataset rels = ClusterUtils .readPath(spark, graphPath + "/relation", Relation.class) .filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT)) - .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)) + .filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass())) .filter(r -> !ClusterUtils.isDedupRoot(r.getSource())) .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget())); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java index 968bde881..de9b901d0 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java @@ -36,4 +36,12 @@ public class ClusterUtils { return id.contains("dedup_wf_"); } + public static final boolean isValidResultResultClass(final String s) { + return s.equals("isReferencedBy") + || s.equals("isRelatedTo") + || s.equals("references") + || s.equals("isSupplementedBy") + || s.equals("isSupplementedTo"); + } + } From d13e3d3f6896efc23ecac430e629e163cb4615c6 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Tue, 23 
Jun 2020 11:01:42 +0200 Subject: [PATCH 18/37] fixed paths --- .../dhp/broker/oa/GenerateEventsJob.java | 5 +- .../dhp/broker/oa/JoinEntitiesJob.java | 13 ++-- .../dhp/broker/oa/PrepareGroupsJob.java | 2 +- .../oa/generate_all/oozie_app/workflow.xml | 1 - .../dhp/broker/oa/generate_events.json | 6 -- .../broker/oa/partial/oozie_app/workflow.xml | 59 +------------------ 6 files changed, 8 insertions(+), 78 deletions(-) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java index 3ea0086ff..089fbf6d4 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java @@ -44,9 +44,6 @@ public class GenerateEventsJob { .orElse(Boolean.TRUE); log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String graphPath = parser.get("graphPath"); - log.info("graphPath: {}", graphPath); - final String workingPath = parser.get("workingPath"); log.info("workingPath: {}", workingPath); @@ -70,7 +67,7 @@ public class GenerateEventsJob { ClusterUtils.removeDir(spark, eventsPath); final Dataset groups = ClusterUtils - .readPath(spark, graphPath + "/relation", ResultGroup.class); + .readPath(spark, workingPath + "/relation", ResultGroup.class); final Dataset events = groups .map( diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinEntitiesJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinEntitiesJob.java index dac308f36..da77a4673 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinEntitiesJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinEntitiesJob.java @@ -43,9 +43,6 @@ public class JoinEntitiesJob { .orElse(Boolean.TRUE); 
log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String graphPath = parser.get("graphPath"); - log.info("graphPath: {}", graphPath); - final String workingPath = parser.get("workingPath"); log.info("workingPath: {}", workingPath); @@ -59,16 +56,16 @@ public class JoinEntitiesJob { ClusterUtils.removeDir(spark, joinedEntitiesPath); final Dataset r0 = ClusterUtils - .readPath(spark, graphPath + "/simpleEntities", OaBrokerMainEntity.class); + .readPath(spark, workingPath + "/simpleEntities", OaBrokerMainEntity.class); final Dataset r1 = join( - r0, ClusterUtils.readPath(spark, graphPath + "/relatedProjects", RelatedProject.class)); + r0, ClusterUtils.readPath(spark, workingPath + "/relatedProjects", RelatedProject.class)); final Dataset r2 = join( - r1, ClusterUtils.readPath(spark, graphPath + "/relatedDatasets", RelatedDataset.class)); + r1, ClusterUtils.readPath(spark, workingPath + "/relatedDatasets", RelatedDataset.class)); final Dataset r3 = join( - r2, ClusterUtils.readPath(spark, graphPath + "/relatedPublications", RelatedPublication.class)); + r2, ClusterUtils.readPath(spark, workingPath + "/relatedPublications", RelatedPublication.class)); final Dataset r4 = join( - r3, ClusterUtils.readPath(spark, graphPath + "/relatedSoftwares", RelatedSoftware.class)); + r3, ClusterUtils.readPath(spark, workingPath + "/relatedSoftwares", RelatedSoftware.class)); r4.write().mode(SaveMode.Overwrite).json(joinedEntitiesPath); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java index aa057eee8..159047dad 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java @@ -58,7 +58,7 @@ public class PrepareGroupsJob { ClusterUtils.removeDir(spark, groupsPath); final Dataset 
results = ClusterUtils - .readPath(spark, graphPath + "/joinedEntities", OaBrokerMainEntity.class); + .readPath(spark, workingPath + "/joinedEntities", OaBrokerMainEntity.class); final Dataset mergedRels = ClusterUtils .readPath(spark, graphPath + "/relation", Relation.class) diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml index bec6f221d..18e2eedca 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml @@ -283,7 +283,6 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 - --graphPath${graphInputPath} --workingPath${workingPath} --isLookupUrl${isLookupUrl} --dedupConfProfile${dedupConfProfId} diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_events.json b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_events.json index d185bc73d..7ae076159 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_events.json +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_events.json @@ -1,10 +1,4 @@ [ - { - "paramName": "g", - "paramLongName": "graphPath", - "paramDescription": "the path where there the graph is stored", - "paramRequired": true - }, { "paramName": "o", "paramLongName": "workingPath", diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml index 253910595..1ccdef929 100644 --- 
a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml @@ -73,68 +73,12 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - yarn - cluster - PrepareRelatedPublicationsJob - eu.dnetlib.dhp.broker.oa.PrepareRelatedPublicationsJob - dhp-broker-events-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 - - --graphPath${graphInputPath} - --workingPath${workingPath} - - - - - - - - - yarn - cluster - PrepareRelatedDatasetsJob - eu.dnetlib.dhp.broker.oa.PrepareRelatedDatasetsJob - dhp-broker-events-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 - - --graphPath${graphInputPath} - --workingPath${workingPath} - - - - @@ -201,7 +145,6 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 - --graphPath${graphInputPath} --workingPath${workingPath} --isLookupUrl${isLookupUrl} --dedupConfProfile${dedupConfProfId} From 8b9933b934eac804d50f126411a3f27ee2384182 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Wed, 24 
Jun 2020 08:57:13 +0200 Subject: [PATCH 19/37] refactoring aggregators --- .../dhp/broker/oa/GenerateEventsJob.java | 2 +- .../dhp/broker/oa/JoinEntitiesJob.java | 35 +++++---- .../dhp/broker/oa/PrepareGroupsJob.java | 2 +- .../OaBrokerMainEntityAggregator.java | 71 ------------------- .../withRels/RelatedDatasetAggregator.java | 58 +++++++++++++++ .../withRels/RelatedProjectAggregator.java | 58 +++++++++++++++ .../RelatedPublicationAggregator.java | 59 +++++++++++++++ .../withRels/RelatedSoftwareAggregator.java | 58 +++++++++++++++ 8 files changed, 252 insertions(+), 91 deletions(-) delete mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/OaBrokerMainEntityAggregator.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDatasetAggregator.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProjectAggregator.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublicationAggregator.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftwareAggregator.java diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java index 089fbf6d4..eaeb6d271 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java @@ -67,7 +67,7 @@ public class GenerateEventsJob { ClusterUtils.removeDir(spark, eventsPath); final Dataset groups = ClusterUtils - .readPath(spark, workingPath + "/relation", ResultGroup.class); + .readPath(spark, workingPath + 
"/duplicates", ResultGroup.class); final Dataset events = groups .map( diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinEntitiesJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinEntitiesJob.java index da77a4673..868faa8f5 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinEntitiesJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinEntitiesJob.java @@ -11,18 +11,15 @@ import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.TypedColumn; +import org.apache.spark.sql.expressions.Aggregator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; -import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.OaBrokerMainEntityAggregator; -import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDataset; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject; -import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedPublication; -import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedSoftware; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProjectAggregator; import scala.Tuple2; public class JoinEntitiesJob { @@ -59,31 +56,33 @@ public class JoinEntitiesJob { .readPath(spark, workingPath + "/simpleEntities", OaBrokerMainEntity.class); final Dataset r1 = join( - r0, ClusterUtils.readPath(spark, workingPath + "/relatedProjects", RelatedProject.class)); - final Dataset r2 = join( - r1, ClusterUtils.readPath(spark, workingPath + "/relatedDatasets", RelatedDataset.class)); - final Dataset r3 = join( - r2, ClusterUtils.readPath(spark, workingPath + 
"/relatedPublications", RelatedPublication.class)); - final Dataset r4 = join( - r3, ClusterUtils.readPath(spark, workingPath + "/relatedSoftwares", RelatedSoftware.class)); + r0, ClusterUtils.readPath(spark, workingPath + "/relatedProjects", RelatedProject.class), + new RelatedProjectAggregator()); + // final Dataset r2 = join( + // r1, ClusterUtils.readPath(spark, workingPath + "/relatedDatasets", RelatedDataset.class), new + // RelatedDatasetAggregator()); + // final Dataset r3 = join( + // r2, ClusterUtils.readPath(spark, workingPath + "/relatedPublications", RelatedPublication.class), new + // RelatedPublicationAggregator()); + // final Dataset r4 = join( + // r3, ClusterUtils.readPath(spark, workingPath + "/relatedSoftwares", RelatedSoftware.class), new + // RelatedSoftwareAggregator()); - r4.write().mode(SaveMode.Overwrite).json(joinedEntitiesPath); + r1.write().mode(SaveMode.Overwrite).json(joinedEntitiesPath); }); } private static Dataset join(final Dataset sources, - final Dataset typedRels) { - - final TypedColumn, OaBrokerMainEntity> aggr = new OaBrokerMainEntityAggregator() - .toColumn(); + final Dataset typedRels, + final Aggregator, OaBrokerMainEntity, OaBrokerMainEntity> aggr) { return sources .joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer") .groupByKey( (MapFunction, String>) t -> t._1.getOpenaireId(), Encoders.STRING()) - .agg(aggr) + .agg(aggr.toColumn()) .map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class)); } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java index 159047dad..934ddff59 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java @@ -48,7 +48,7 @@ public class PrepareGroupsJob { final String 
workingPath = parser.get("workingPath"); log.info("workingPath: {}", workingPath); - final String groupsPath = workingPath + "/groups"; + final String groupsPath = workingPath + "/duplicates"; log.info("groupsPath: {}", groupsPath); final SparkConf conf = new SparkConf(); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/OaBrokerMainEntityAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/OaBrokerMainEntityAggregator.java deleted file mode 100644 index 6a2d9b06d..000000000 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/OaBrokerMainEntityAggregator.java +++ /dev/null @@ -1,71 +0,0 @@ - -package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; - -import org.apache.spark.sql.Encoder; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.expressions.Aggregator; - -import eu.dnetlib.broker.objects.OaBrokerMainEntity; -import scala.Tuple2; - -public class OaBrokerMainEntityAggregator - extends Aggregator, OaBrokerMainEntity, OaBrokerMainEntity> { - - /** - * - */ - private static final long serialVersionUID = -3687878788861013488L; - - @Override - public OaBrokerMainEntity zero() { - return new OaBrokerMainEntity(); - } - - @Override - public OaBrokerMainEntity finish(final OaBrokerMainEntity g) { - return g; - } - - @Override - public OaBrokerMainEntity reduce(final OaBrokerMainEntity g, final Tuple2 t) { - if (g.getOriginalId() == null) { - return t._1; - } else if (t._2 instanceof RelatedSoftware) { - g.getSoftwares().add(((RelatedSoftware) t._2).getRelSoftware()); - } else if (t._2 instanceof RelatedDataset) { - g.getDatasets().add(((RelatedDataset) t._2).getRelDataset()); - } else if (t._2 instanceof RelatedPublication) { - g.getPublications().add(((RelatedPublication) t._2).getRelPublication()); - } else if (t._2 instanceof RelatedProject) { - 
g.getProjects().add(((RelatedProject) t._2).getRelProject()); - } else { - throw new RuntimeException("Invalid Object: " + t._2.getClass()); - } - return g; - - } - - @Override - public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) { - if (g1.getOriginalId() != null) { - g1.getSoftwares().addAll(g2.getSoftwares()); - g1.getDatasets().addAll(g2.getDatasets()); - g1.getPublications().addAll(g2.getPublications()); - g1.getProjects().addAll(g2.getProjects()); - return g1; - } else { - return g2; - } - } - - @Override - public Encoder bufferEncoder() { - return Encoders.bean(OaBrokerMainEntity.class); - } - - @Override - public Encoder outputEncoder() { - return Encoders.bean(OaBrokerMainEntity.class); - } - -} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDatasetAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDatasetAggregator.java new file mode 100644 index 000000000..04840afe9 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDatasetAggregator.java @@ -0,0 +1,58 @@ + +package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; + +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.expressions.Aggregator; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import scala.Tuple2; + +public class RelatedDatasetAggregator + extends Aggregator, OaBrokerMainEntity, OaBrokerMainEntity> { + + /** + * + */ + private static final long serialVersionUID = 6969761680131482557L; + + @Override + public OaBrokerMainEntity zero() { + return new OaBrokerMainEntity(); + } + + @Override + public OaBrokerMainEntity finish(final OaBrokerMainEntity g) { + return g; + } + + @Override + public OaBrokerMainEntity reduce(final 
OaBrokerMainEntity g, final Tuple2 t) { + final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOriginalId()) ? g : t._1; + res.getDatasets().add(t._2.getRelDataset()); + return res; + + } + + @Override + public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) { + if (StringUtils.isNotBlank(g1.getOriginalId())) { + g1.getDatasets().addAll(g2.getDatasets()); + return g1; + } else { + return g2; + } + } + + @Override + public Encoder bufferEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); + } + + @Override + public Encoder outputEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProjectAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProjectAggregator.java new file mode 100644 index 000000000..025cc413a --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProjectAggregator.java @@ -0,0 +1,58 @@ + +package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; + +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.expressions.Aggregator; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import scala.Tuple2; + +public class RelatedProjectAggregator + extends Aggregator, OaBrokerMainEntity, OaBrokerMainEntity> { + + /** + * + */ + private static final long serialVersionUID = 8559808519152275763L; + + @Override + public OaBrokerMainEntity zero() { + return new OaBrokerMainEntity(); + } + + @Override + public OaBrokerMainEntity finish(final OaBrokerMainEntity g) { + return g; + } + + @Override + public OaBrokerMainEntity reduce(final OaBrokerMainEntity g, final Tuple2 t) { + final OaBrokerMainEntity res = 
StringUtils.isNotBlank(g.getOriginalId()) ? g : t._1; + res.getProjects().add(t._2.getRelProject()); + return res; + + } + + @Override + public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) { + if (StringUtils.isNotBlank(g1.getOriginalId())) { + g1.getProjects().addAll(g2.getProjects()); + return g1; + } else { + return g2; + } + } + + @Override + public Encoder bufferEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); + } + + @Override + public Encoder outputEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublicationAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublicationAggregator.java new file mode 100644 index 000000000..1b54d4a12 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublicationAggregator.java @@ -0,0 +1,59 @@ + +package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; + +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.expressions.Aggregator; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import scala.Tuple2; + +public class RelatedPublicationAggregator + extends Aggregator, OaBrokerMainEntity, OaBrokerMainEntity> { + + /** + * + */ + private static final long serialVersionUID = 4656934981558135919L; + + @Override + public OaBrokerMainEntity zero() { + return new OaBrokerMainEntity(); + } + + @Override + public OaBrokerMainEntity finish(final OaBrokerMainEntity g) { + return g; + } + + @Override + public OaBrokerMainEntity reduce(final OaBrokerMainEntity g, + final Tuple2 t) { + final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOriginalId()) ? 
g : t._1; + res.getPublications().add(t._2.getRelPublication()); + return res; + + } + + @Override + public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) { + if (StringUtils.isNotBlank(g1.getOriginalId())) { + g1.getPublications().addAll(g2.getPublications()); + return g1; + } else { + return g2; + } + } + + @Override + public Encoder bufferEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); + } + + @Override + public Encoder outputEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftwareAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftwareAggregator.java new file mode 100644 index 000000000..871cc4f06 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftwareAggregator.java @@ -0,0 +1,58 @@ + +package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; + +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.expressions.Aggregator; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import scala.Tuple2; + +public class RelatedSoftwareAggregator + extends Aggregator, OaBrokerMainEntity, OaBrokerMainEntity> { + + /** + * + */ + private static final long serialVersionUID = -8987959389106443702L; + + @Override + public OaBrokerMainEntity zero() { + return new OaBrokerMainEntity(); + } + + @Override + public OaBrokerMainEntity finish(final OaBrokerMainEntity g) { + return g; + } + + @Override + public OaBrokerMainEntity reduce(final OaBrokerMainEntity g, final Tuple2 t) { + final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOriginalId()) ? 
g : t._1; + res.getSoftwares().add(t._2.getRelSoftware()); + return res; + + } + + @Override + public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) { + if (StringUtils.isNotBlank(g1.getOriginalId())) { + g1.getSoftwares().addAll(g2.getSoftwares()); + return g1; + } else { + return g2; + } + } + + @Override + public Encoder bufferEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); + } + + @Override + public Encoder outputEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); + } + +} From e53dd62e879dfe7645ca8f0a4e681f7e126fe090 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Wed, 24 Jun 2020 09:24:45 +0200 Subject: [PATCH 20/37] minot changes --- .../java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java | 2 +- .../aggregators/withRels/RelatedDatasetAggregator.java | 8 +++++--- .../aggregators/withRels/RelatedProjectAggregator.java | 8 +++++--- .../withRels/RelatedPublicationAggregator.java | 8 +++++--- .../aggregators/withRels/RelatedSoftwareAggregator.java | 8 +++++--- 5 files changed, 21 insertions(+), 13 deletions(-) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java index eaeb6d271..dbe2fdd47 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java @@ -53,7 +53,7 @@ public class GenerateEventsJob { final String dedupConfigProfileId = parser.get("dedupConfProfile"); log.info("dedupConfigProfileId: {}", dedupConfigProfileId); - final String eventsPath = workingPath + "/eventsPath"; + final String eventsPath = workingPath + "/events"; log.info("eventsPath: {}", eventsPath); final SparkConf conf = new SparkConf(); diff --git 
a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDatasetAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDatasetAggregator.java index 04840afe9..a963f073d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDatasetAggregator.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDatasetAggregator.java @@ -29,15 +29,17 @@ public class RelatedDatasetAggregator @Override public OaBrokerMainEntity reduce(final OaBrokerMainEntity g, final Tuple2 t) { - final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOriginalId()) ? g : t._1; - res.getDatasets().add(t._2.getRelDataset()); + final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOpenaireId()) ? g : t._1; + if (t._2 != null) { + res.getDatasets().add(t._2.getRelDataset()); + } return res; } @Override public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) { - if (StringUtils.isNotBlank(g1.getOriginalId())) { + if (StringUtils.isNotBlank(g1.getOpenaireId())) { g1.getDatasets().addAll(g2.getDatasets()); return g1; } else { diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProjectAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProjectAggregator.java index 025cc413a..3fedb1a32 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProjectAggregator.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProjectAggregator.java @@ -29,15 +29,17 @@ public class RelatedProjectAggregator @Override public OaBrokerMainEntity reduce(final OaBrokerMainEntity g, final Tuple2 t) { - 
final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOriginalId()) ? g : t._1; - res.getProjects().add(t._2.getRelProject()); + final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOpenaireId()) ? g : t._1; + if (t._2 != null) { + res.getProjects().add(t._2.getRelProject()); + } return res; } @Override public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) { - if (StringUtils.isNotBlank(g1.getOriginalId())) { + if (StringUtils.isNotBlank(g1.getOpenaireId())) { g1.getProjects().addAll(g2.getProjects()); return g1; } else { diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublicationAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublicationAggregator.java index 1b54d4a12..b331599ad 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublicationAggregator.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublicationAggregator.java @@ -30,15 +30,17 @@ public class RelatedPublicationAggregator @Override public OaBrokerMainEntity reduce(final OaBrokerMainEntity g, final Tuple2 t) { - final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOriginalId()) ? g : t._1; - res.getPublications().add(t._2.getRelPublication()); + final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOpenaireId()) ? 
g : t._1; + if (t._2 != null) { + res.getPublications().add(t._2.getRelPublication()); + } return res; } @Override public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) { - if (StringUtils.isNotBlank(g1.getOriginalId())) { + if (StringUtils.isNotBlank(g1.getOpenaireId())) { g1.getPublications().addAll(g2.getPublications()); return g1; } else { diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftwareAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftwareAggregator.java index 871cc4f06..d3b1c3407 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftwareAggregator.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftwareAggregator.java @@ -29,15 +29,17 @@ public class RelatedSoftwareAggregator @Override public OaBrokerMainEntity reduce(final OaBrokerMainEntity g, final Tuple2 t) { - final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOriginalId()) ? g : t._1; - res.getSoftwares().add(t._2.getRelSoftware()); + final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOpenaireId()) ? 
g : t._1; + if (t._2 != null) { + res.getSoftwares().add(t._2.getRelSoftware()); + } return res; } @Override public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) { - if (StringUtils.isNotBlank(g1.getOriginalId())) { + if (StringUtils.isNotBlank(g1.getOpenaireId())) { g1.getSoftwares().addAll(g2.getSoftwares()); return g1; } else { From 202f6e62ff0002573ff69be4dbe570b7b6569989 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Wed, 24 Jun 2020 15:47:06 +0200 Subject: [PATCH 21/37] Splitted join wf --- ...JoinEntitiesJob.java => JoinStep1Job.java} | 53 +++++------- .../dnetlib/dhp/broker/oa/JoinStep2Job.java | 79 ++++++++++++++++++ .../dnetlib/dhp/broker/oa/JoinStep3Job.java | 79 ++++++++++++++++++ .../dnetlib/dhp/broker/oa/JoinStep4Job.java | 79 ++++++++++++++++++ .../dhp/broker/oa/PrepareGroupsJob.java | 4 +- .../broker/oa/PrepareRelatedDatasetsJob.java | 1 + .../broker/oa/PrepareRelatedProjectsJob.java | 5 +- .../oa/PrepareRelatedPublicationsJob.java | 5 +- .../broker/oa/PrepareRelatedSoftwaresJob.java | 5 +- .../oa/generate_all/oozie_app/workflow.xml | 80 ++++++++++++++++++- .../broker/oa/partial/oozie_app/workflow.xml | 55 +++++++++---- 11 files changed, 379 insertions(+), 66 deletions(-) rename dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/{JoinEntitiesJob.java => JoinStep1Job.java} (52%) create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep2Job.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep3Job.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep4Job.java diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinEntitiesJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep1Job.java similarity index 52% rename from 
dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinEntitiesJob.java rename to dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep1Job.java index 868faa8f5..1be782a12 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinEntitiesJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep1Job.java @@ -11,7 +11,7 @@ import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.expressions.Aggregator; +import org.apache.spark.sql.TypedColumn; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -22,15 +22,15 @@ import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProjectAggregator; import scala.Tuple2; -public class JoinEntitiesJob { +public class JoinStep1Job { - private static final Logger log = LoggerFactory.getLogger(JoinEntitiesJob.class); + private static final Logger log = LoggerFactory.getLogger(JoinStep1Job.class); public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils .toString( - JoinEntitiesJob.class + JoinStep1Job.class .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); parser.parseArgument(args); @@ -43,7 +43,7 @@ public class JoinEntitiesJob { final String workingPath = parser.get("workingPath"); log.info("workingPath: {}", workingPath); - final String joinedEntitiesPath = workingPath + "/joinedEntities"; + final String joinedEntitiesPath = workingPath + "/joinedEntities_step1"; log.info("joinedEntitiesPath: {}", joinedEntitiesPath); final SparkConf conf = new SparkConf(); @@ -52,39 +52,28 @@ public class JoinEntitiesJob { ClusterUtils.removeDir(spark, joinedEntitiesPath); - final Dataset r0 = 
ClusterUtils + final Dataset sources = ClusterUtils .readPath(spark, workingPath + "/simpleEntities", OaBrokerMainEntity.class); - final Dataset r1 = join( - r0, ClusterUtils.readPath(spark, workingPath + "/relatedProjects", RelatedProject.class), - new RelatedProjectAggregator()); - // final Dataset r2 = join( - // r1, ClusterUtils.readPath(spark, workingPath + "/relatedDatasets", RelatedDataset.class), new - // RelatedDatasetAggregator()); - // final Dataset r3 = join( - // r2, ClusterUtils.readPath(spark, workingPath + "/relatedPublications", RelatedPublication.class), new - // RelatedPublicationAggregator()); - // final Dataset r4 = join( - // r3, ClusterUtils.readPath(spark, workingPath + "/relatedSoftwares", RelatedSoftware.class), new - // RelatedSoftwareAggregator()); + final Dataset typedRels = ClusterUtils + .readPath(spark, workingPath + "/relatedProjects", RelatedProject.class); - r1.write().mode(SaveMode.Overwrite).json(joinedEntitiesPath); + final TypedColumn, OaBrokerMainEntity> aggr = new RelatedProjectAggregator() + .toColumn(); + + sources + .joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer") + .groupByKey( + (MapFunction, String>) t -> t._1.getOpenaireId(), + Encoders.STRING()) + .agg(aggr) + .map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class)) + .write() + .mode(SaveMode.Overwrite) + .json(joinedEntitiesPath); }); } - private static Dataset join(final Dataset sources, - final Dataset typedRels, - final Aggregator, OaBrokerMainEntity, OaBrokerMainEntity> aggr) { - - return sources - .joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer") - .groupByKey( - (MapFunction, String>) t -> t._1.getOpenaireId(), Encoders.STRING()) - .agg(aggr.toColumn()) - .map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class)); - - } - } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep2Job.java 
b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep2Job.java new file mode 100644 index 000000000..103d79553 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep2Job.java @@ -0,0 +1,79 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.TypedColumn; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedSoftware; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedSoftwareAggregator; +import scala.Tuple2; + +public class JoinStep2Job { + + private static final Logger log = LoggerFactory.getLogger(JoinStep2Job.class); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + JoinStep2Job.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); + + final String joinedEntitiesPath = workingPath + "/joinedEntities_step2"; + log.info("joinedEntitiesPath: {}", joinedEntitiesPath); + + final SparkConf conf = 
new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, joinedEntitiesPath); + + final Dataset sources = ClusterUtils + .readPath(spark, workingPath + "/joinedEntities_step1", OaBrokerMainEntity.class); + + final Dataset typedRels = ClusterUtils + .readPath(spark, workingPath + "/relatedSoftwares", RelatedSoftware.class); + + final TypedColumn, OaBrokerMainEntity> aggr = new RelatedSoftwareAggregator() + .toColumn(); + + sources + .joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer") + .groupByKey( + (MapFunction, String>) t -> t._1.getOpenaireId(), + Encoders.STRING()) + .agg(aggr) + .map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class)) + .write() + .mode(SaveMode.Overwrite) + .json(joinedEntitiesPath); + + }); + + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep3Job.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep3Job.java new file mode 100644 index 000000000..ceb199dc4 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep3Job.java @@ -0,0 +1,79 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.TypedColumn; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDataset; +import 
eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDatasetAggregator; +import scala.Tuple2; + +public class JoinStep3Job { + + private static final Logger log = LoggerFactory.getLogger(JoinStep3Job.class); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + JoinStep3Job.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); + + final String joinedEntitiesPath = workingPath + "/joinedEntities_step3"; + log.info("joinedEntitiesPath: {}", joinedEntitiesPath); + + final SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, joinedEntitiesPath); + + final Dataset sources = ClusterUtils + .readPath(spark, workingPath + "/joinedEntities_step2", OaBrokerMainEntity.class); + + final Dataset typedRels = ClusterUtils + .readPath(spark, workingPath + "/relatedDatasets", RelatedDataset.class); + + final TypedColumn, OaBrokerMainEntity> aggr = new RelatedDatasetAggregator() + .toColumn(); + + sources + .joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer") + .groupByKey( + (MapFunction, String>) t -> t._1.getOpenaireId(), + Encoders.STRING()) + .agg(aggr) + .map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class)) + .write() + .mode(SaveMode.Overwrite) + .json(joinedEntitiesPath); + + }); + + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep4Job.java 
b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep4Job.java new file mode 100644 index 000000000..3067810dd --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep4Job.java @@ -0,0 +1,79 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.TypedColumn; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedPublication; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedPublicationAggregator; +import scala.Tuple2; + +public class JoinStep4Job { + + private static final Logger log = LoggerFactory.getLogger(JoinStep4Job.class); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + JoinStep4Job.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); + + final String joinedEntitiesPath = workingPath + "/joinedEntities_step4"; + log.info("joinedEntitiesPath: {}", joinedEntitiesPath); + + final SparkConf 
conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, joinedEntitiesPath); + + final Dataset sources = ClusterUtils + .readPath(spark, workingPath + "/joinedEntities_step3", OaBrokerMainEntity.class); + + final Dataset typedRels = ClusterUtils + .readPath(spark, workingPath + "/relatedPublications", RelatedPublication.class); + + final TypedColumn, OaBrokerMainEntity> aggr = new RelatedPublicationAggregator() + .toColumn(); + + sources + .joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer") + .groupByKey( + (MapFunction, String>) t -> t._1.getOpenaireId(), + Encoders.STRING()) + .agg(aggr) + .map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class)) + .write() + .mode(SaveMode.Overwrite) + .json(joinedEntitiesPath); + + }); + + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java index 934ddff59..47a9f36c5 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java @@ -32,7 +32,7 @@ public class PrepareGroupsJob { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils .toString( - JoinEntitiesJob.class + PrepareGroupsJob.class .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); parser.parseArgument(args); @@ -58,7 +58,7 @@ public class PrepareGroupsJob { ClusterUtils.removeDir(spark, groupsPath); final Dataset results = ClusterUtils - .readPath(spark, workingPath + "/joinedEntities", OaBrokerMainEntity.class); + .readPath(spark, workingPath + "/joinedEntities_step4", OaBrokerMainEntity.class); final Dataset mergedRels = ClusterUtils .readPath(spark, graphPath + "/relation", Relation.class) diff --git 
a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java index fe9c87e87..6e006ccf0 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java @@ -61,6 +61,7 @@ public class PrepareRelatedDatasetsJob { final Dataset rels = ClusterUtils .readPath(spark, graphPath + "/relation", Relation.class) + .filter(r -> r.getDataInfo().getDeletedbyinference()) .filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT)) .filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass())) .filter(r -> !ClusterUtils.isDedupRoot(r.getSource())) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java index 3ae240982..0af5d21b7 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java @@ -13,8 +13,6 @@ import org.apache.spark.sql.SaveMode; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.fasterxml.jackson.databind.ObjectMapper; - import eu.dnetlib.broker.objects.OaBrokerProject; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; @@ -29,8 +27,6 @@ public class PrepareRelatedProjectsJob { private static final Logger log = LoggerFactory.getLogger(PrepareRelatedProjectsJob.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new 
ArgumentApplicationParser( IOUtils @@ -67,6 +63,7 @@ public class PrepareRelatedProjectsJob { final Dataset rels = ClusterUtils .readPath(spark, graphPath + "/relation", Relation.class) + .filter(r -> r.getDataInfo().getDeletedbyinference()) .filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT)) .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)) .filter(r -> !ClusterUtils.isDedupRoot(r.getSource())) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java index 8814ef3e0..84752776e 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java @@ -13,8 +13,6 @@ import org.apache.spark.sql.SaveMode; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.fasterxml.jackson.databind.ObjectMapper; - import eu.dnetlib.broker.objects.OaBrokerRelatedPublication; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; @@ -28,8 +26,6 @@ public class PrepareRelatedPublicationsJob { private static final Logger log = LoggerFactory.getLogger(PrepareRelatedPublicationsJob.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils @@ -68,6 +64,7 @@ public class PrepareRelatedPublicationsJob { final Dataset rels = ClusterUtils .readPath(spark, graphPath + "/relation", Relation.class) + .filter(r -> r.getDataInfo().getDeletedbyinference()) .filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT)) .filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass())) .filter(r -> 
!ClusterUtils.isDedupRoot(r.getSource())) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java index 0704fb44a..0ad753a97 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java @@ -13,8 +13,6 @@ import org.apache.spark.sql.SaveMode; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.fasterxml.jackson.databind.ObjectMapper; - import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; @@ -29,8 +27,6 @@ public class PrepareRelatedSoftwaresJob { private static final Logger log = LoggerFactory.getLogger(PrepareRelatedSoftwaresJob.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils @@ -67,6 +63,7 @@ public class PrepareRelatedSoftwaresJob { final Dataset rels = ClusterUtils .readPath(spark, graphPath + "/relation", Relation.class) + .filter(r -> r.getDataInfo().getDeletedbyinference()) .filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT)) .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)) .filter(r -> !ClusterUtils.isDedupRoot(r.getSource())) diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml index 18e2eedca..8752200ff 100644 --- 
a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml @@ -216,14 +216,86 @@ - + - + yarn cluster - JoinEntitiesJob - eu.dnetlib.dhp.broker.oa.JoinEntitiesJob + JoinStep1 + eu.dnetlib.dhp.broker.oa.JoinStep1Job + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + yarn + cluster + JoinStep2 + eu.dnetlib.dhp.broker.oa.JoinStep2Job + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + yarn + cluster + JoinStep3 + eu.dnetlib.dhp.broker.oa.JoinStep3Job + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf 
spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + yarn + cluster + JoinStep4 + eu.dnetlib.dhp.broker.oa.JoinStep4Job dhp-broker-events-${projectVersion}.jar --executor-cores=${sparkExecutorCores} diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml index 1ccdef929..26fa429e6 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml @@ -73,19 +73,19 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + yarn cluster - JoinEntitiesJob - eu.dnetlib.dhp.broker.oa.JoinEntitiesJob + JoinStep1 + eu.dnetlib.dhp.broker.oa.JoinStep1Job dhp-broker-events-${projectVersion}.jar --executor-cores=${sparkExecutorCores} @@ -100,16 +100,16 @@ --graphPath${graphInputPath} --workingPath${workingPath} - + - - + + yarn cluster - PrepareGroupsJob - eu.dnetlib.dhp.broker.oa.PrepareGroupsJob + JoinStep2 + eu.dnetlib.dhp.broker.oa.JoinStep2Job dhp-broker-events-${projectVersion}.jar --executor-cores=${sparkExecutorCores} @@ -124,16 +124,16 @@ --graphPath${graphInputPath} --workingPath${workingPath} - + - - + + yarn cluster - GenerateEventsJob - eu.dnetlib.dhp.broker.oa.GenerateEventsJob + JoinStep3 + eu.dnetlib.dhp.broker.oa.JoinStep3Job dhp-broker-events-${projectVersion}.jar --executor-cores=${sparkExecutorCores} @@ -145,9 +145,32 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + yarn + cluster + 
JoinStep4 + eu.dnetlib.dhp.broker.oa.JoinStep4Job + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} --workingPath${workingPath} - --isLookupUrl${isLookupUrl} - --dedupConfProfile${dedupConfProfId} From 0e723d378b3e1cf63cb40bdb27e5ab4cb272bfa2 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 24 Jun 2020 18:34:42 +0200 Subject: [PATCH 22/37] added default from vocab for missing instance.refereed; remove spurious prefixes from orcid values; WIP: prepare relation job --- .../oa/graph/clean/CleanGraphSparkJob.java | 3 + .../CreateRelatedEntitiesJob_phase1.java | 15 +- .../dhp/oa/provision/PrepareRelationsJob.java | 31 +- .../model/ProvisionModelSupport.java | 2 +- .../provision/model/RelatedEntityWrapper.java | 12 +- .../oa/provision/model/SortableRelation.java | 38 -- .../provision/model/SortableRelationKey.java | 90 ++++ .../provision/utils/RelationPartitioner.java | 6 +- .../oa/provision/utils/XmlRecordFactory.java | 2 +- .../dhp/oa/provision/oozie_app/workflow.xml | 480 +----------------- 10 files changed, 128 insertions(+), 551 deletions(-) delete mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java index 
8f43ab1cf..bdbd64160 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java @@ -147,6 +147,9 @@ public class CleanGraphSparkJob { if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) { i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY); } + if (Objects.isNull(i.getRefereed())) { + i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS)); + } } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java index 4d2633bc5..80b800017 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java @@ -25,9 +25,7 @@ import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport; import eu.dnetlib.dhp.oa.provision.model.RelatedEntity; import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper; -import eu.dnetlib.dhp.oa.provision.model.SortableRelation; import eu.dnetlib.dhp.schema.common.EntityType; -import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; import scala.Tuple2; @@ -109,11 +107,12 @@ public class CreateRelatedEntitiesJob_phase1 { Class clazz, String outputPath) { - Dataset> relsByTarget = readPathRelation(spark, inputRelationsPath) + Dataset> relsByTarget = readPathRelation(spark, inputRelationsPath) .filter("dataInfo.deletedbyinference == false") .map( - (MapFunction>) r -> new Tuple2<>(r.getTarget(), r), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(SortableRelation.class))) + (MapFunction>) r -> new 
Tuple2<>(r.getTarget(), + r), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(Relation.class))) .cache(); Dataset> entities = readPathEntity(spark, inputEntityPath, clazz) @@ -129,7 +128,7 @@ public class CreateRelatedEntitiesJob_phase1 { relsByTarget .joinWith(entities, entities.col("_1").equalTo(relsByTarget.col("_1")), "inner") .map( - (MapFunction, Tuple2>, RelatedEntityWrapper>) t -> new RelatedEntityWrapper( + (MapFunction, Tuple2>, RelatedEntityWrapper>) t -> new RelatedEntityWrapper( t._1()._2(), t._2()._2()), Encoders.kryo(RelatedEntityWrapper.class)) .write() @@ -232,11 +231,11 @@ public class CreateRelatedEntitiesJob_phase1 { * @param relationPath * @return the Dataset containing all the relationships */ - private static Dataset readPathRelation( + private static Dataset readPathRelation( SparkSession spark, final String relationPath) { log.info("Reading relations from: {}", relationPath); - return spark.read().load(relationPath).as(Encoders.bean(SortableRelation.class)); + return spark.read().load(relationPath).as(Encoders.bean(Relation.class)); } private static void removeOutputDir(SparkSession spark, String path) { diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java index 6b184071a..d69b75b65 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java @@ -4,35 +4,28 @@ package eu.dnetlib.dhp.oa.provision; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.util.*; -import java.util.function.Function; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import 
org.apache.spark.api.java.function.*; import org.apache.spark.rdd.RDD; -import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.clearspring.analytics.util.Lists; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.base.Splitter; import com.google.common.collect.Iterables; -import com.google.common.collect.Iterators; import com.google.common.collect.Sets; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.FunctionalInterfaceSupport; import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.oa.provision.model.SortableRelation; +import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey; import eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner; -import scala.Function1; -import scala.Tuple2; +import eu.dnetlib.dhp.schema.oaf.Relation; /** * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. 
The @@ -133,22 +126,22 @@ public class PrepareRelationsJob { SparkSession spark, String inputRelationsPath, String outputPath, Set relationFilter, int maxRelations, int relPartitions) { - RDD cappedRels = readPathRelationRDD(spark, inputRelationsPath) + RDD cappedRels = readPathRelationRDD(spark, inputRelationsPath) .repartition(relPartitions) .filter(rel -> !rel.getDataInfo().getDeletedbyinference()) .filter(rel -> !relationFilter.contains(rel.getRelClass())) // group by SOURCE and apply limit - .mapToPair(rel -> new Tuple2<>(rel.getSource(), rel)) - .groupByKey(new RelationPartitioner(relPartitions)) - .flatMap(group -> Iterables.limit(group._2(), maxRelations).iterator()) + .groupBy(r -> SortableRelationKey.create(r, r.getSource())) + .repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions)) + .flatMap(t -> Iterables.limit(t._2(), maxRelations).iterator()) // group by TARGET and apply limit - .mapToPair(rel -> new Tuple2<>(rel.getTarget(), rel)) - .groupByKey(new RelationPartitioner(relPartitions)) - .flatMap(group -> Iterables.limit(group._2(), maxRelations).iterator()) + .groupBy(r -> SortableRelationKey.create(r, r.getTarget())) + .repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions)) + .flatMap(t -> Iterables.limit(t._2(), maxRelations).iterator()) .rdd(); spark - .createDataset(cappedRels, Encoders.bean(SortableRelation.class)) + .createDataset(cappedRels, Encoders.bean(Relation.class)) .write() .mode(SaveMode.Overwrite) .parquet(outputPath); @@ -162,10 +155,10 @@ public class PrepareRelationsJob { * @param inputPath * @return the JavaRDD containing all the relationships */ - private static JavaRDD readPathRelationRDD( + private static JavaRDD readPathRelationRDD( SparkSession spark, final String inputPath) { JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - return sc.textFile(inputPath).map(s -> OBJECT_MAPPER.readValue(s, SortableRelation.class)); + return sc.textFile(inputPath).map(s -> 
OBJECT_MAPPER.readValue(s, Relation.class)); } private static void removeOutputDir(SparkSession spark, String path) { diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java index f9fde14e5..051fe923d 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java @@ -19,7 +19,7 @@ public class ProvisionModelSupport { RelatedEntityWrapper.class, JoinedEntity.class, RelatedEntity.class, - SortableRelation.class)); + SortableRelationKey.class)); return modelClasses.toArray(new Class[] {}); } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntityWrapper.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntityWrapper.java index d708b6ed0..cbb143ee2 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntityWrapper.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntityWrapper.java @@ -5,28 +5,30 @@ import java.io.Serializable; import com.google.common.base.Objects; +import eu.dnetlib.dhp.schema.oaf.Relation; + public class RelatedEntityWrapper implements Serializable { - private SortableRelation relation; + private Relation relation; private RelatedEntity target; public RelatedEntityWrapper() { } - public RelatedEntityWrapper(SortableRelation relation, RelatedEntity target) { + public RelatedEntityWrapper(Relation relation, RelatedEntity target) { this(null, relation, target); } - public RelatedEntityWrapper(TypedRow entity, SortableRelation relation, RelatedEntity target) { + public RelatedEntityWrapper(TypedRow entity, 
Relation relation, RelatedEntity target) { this.relation = relation; this.target = target; } - public SortableRelation getRelation() { + public Relation getRelation() { return relation; } - public void setRelation(SortableRelation relation) { + public void setRelation(Relation relation) { this.relation = relation; } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java deleted file mode 100644 index b6571b9bf..000000000 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java +++ /dev/null @@ -1,38 +0,0 @@ - -package eu.dnetlib.dhp.oa.provision.model; - -import java.io.Serializable; -import java.util.Map; - -import com.google.common.collect.ComparisonChain; -import com.google.common.collect.Maps; - -import eu.dnetlib.dhp.schema.oaf.Relation; - -public class SortableRelation extends Relation implements Comparable, Serializable { - - private static final Map weights = Maps.newHashMap(); - - static { - weights.put("outcome", 0); - weights.put("supplement", 1); - weights.put("affiliation", 2); - weights.put("relationship", 3); - weights.put("publicationDataset", 4); - weights.put("similarity", 5); - - weights.put("provision", 6); - weights.put("participation", 7); - weights.put("dedup", 8); - } - - @Override - public int compareTo(Relation o) { - return ComparisonChain - .start() - .compare(weights.get(getSubRelType()), weights.get(o.getSubRelType())) - .compare(getSource(), o.getSource()) - .compare(getTarget(), o.getTarget()) - .result(); - } -} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java new file mode 100644 index 000000000..ad61fa044 --- /dev/null +++ 
b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java @@ -0,0 +1,90 @@ + +package eu.dnetlib.dhp.oa.provision.model; + +import java.io.Serializable; +import java.util.Map; +import java.util.Optional; + +import com.google.common.collect.ComparisonChain; +import com.google.common.collect.Maps; + +import eu.dnetlib.dhp.schema.oaf.Relation; + +public class SortableRelationKey implements Comparable, Serializable { + + private static final Map weights = Maps.newHashMap(); + + static { + weights.put("outcome", 0); + weights.put("supplement", 1); + weights.put("affiliation", 2); + weights.put("relationship", 3); + weights.put("publicationDataset", 4); + weights.put("similarity", 5); + + weights.put("provision", 6); + weights.put("participation", 7); + weights.put("dedup", 8); + } + + private String groupingKey; + + private String source; + + private String target; + + private String subRelType; + + public String getSource() { + return source; + } + + public static SortableRelationKey create(Relation r, String groupingKey) { + SortableRelationKey sr = new SortableRelationKey(); + sr.setGroupingKey(groupingKey); + sr.setSource(r.getSource()); + sr.setTarget(r.getTarget()); + sr.setSubRelType(r.getSubRelType()); + return sr; + } + + @Override + public int compareTo(SortableRelationKey o) { + final Integer wt = Optional.ofNullable(weights.get(getSubRelType())).orElse(Integer.MAX_VALUE); + final Integer wo = Optional.ofNullable(weights.get(o.getSubRelType())).orElse(Integer.MAX_VALUE); + return ComparisonChain + .start() + .compare(wt, wo) + .compare(getSource(), o.getSource()) + .compare(getTarget(), o.getTarget()) + .result(); + } + + public void setSource(String source) { + this.source = source; + } + + public String getTarget() { + return target; + } + + public void setTarget(String target) { + this.target = target; + } + + public String getSubRelType() { + return subRelType; + } + + public void setSubRelType(String 
subRelType) { + this.subRelType = subRelType; + } + + public String getGroupingKey() { + return groupingKey; + } + + public void setGroupingKey(String groupingKey) { + this.groupingKey = groupingKey; + } +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/RelationPartitioner.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/RelationPartitioner.java index c7862b48a..bdece36ab 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/RelationPartitioner.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/RelationPartitioner.java @@ -4,6 +4,8 @@ package eu.dnetlib.dhp.oa.provision.utils; import org.apache.spark.Partitioner; import org.apache.spark.util.Utils; +import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey; + /** * Used in combination with SortableRelationKey, allows to partition the records by source id, therefore allowing to * sort relations sharing the same source id by the ordering defined in SortableRelationKey. 
@@ -23,8 +25,8 @@ public class RelationPartitioner extends Partitioner { @Override public int getPartition(Object key) { - String partitionKey = (String) key; - return Utils.nonNegativeMod(partitionKey.hashCode(), numPartitions()); + SortableRelationKey partitionKey = (SortableRelationKey) key; + return Utils.nonNegativeMod(partitionKey.getGroupingKey().hashCode(), numPartitions()); } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java index b2aa01dc7..5d8d9fa20 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java @@ -276,7 +276,7 @@ public class XmlRecordFactory implements Serializable { pidType, pidValue .toLowerCase() - .replaceAll("orcid", ""))); + .replaceAll("^.*orcid\\.org\\/", ""))); } } }); diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml index 0d5121cf1..e98cbbc73 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml @@ -97,18 +97,7 @@ - - - - - ${wf:conf('resumeFrom') eq 'prepare_relations'} - ${wf:conf('resumeFrom') eq 'fork_join_related_entities'} - ${wf:conf('resumeFrom') eq 'fork_join_all_entities'} - ${wf:conf('resumeFrom') eq 'convert_to_xml'} - ${wf:conf('resumeFrom') eq 'to_solr_index'} - - - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] @@ -135,475 +124,12 @@ --outputPath${workingDir}/relation --relPartitions5000 - - - - - - - - - - - - - - 
- - - yarn - cluster - Join[relation.target = publication.id] - eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1 - dhp-graph-provision-${projectVersion}.jar - - --executor-cores=${sparkExecutorCoresForJoining} - --executor-memory=${sparkExecutorMemoryForJoining} - --driver-memory=${sparkDriverMemoryForJoining} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 - --conf spark.network.timeout=${sparkNetworkTimeout} - - --inputRelationsPath${workingDir}/relation - --inputEntityPath${inputGraphRootPath}/publication - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication - --outputPath${workingDir}/join_partial/publication - - - - - - - - yarn - cluster - Join[relation.target = dataset.id] - eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1 - dhp-graph-provision-${projectVersion}.jar - - --executor-cores=${sparkExecutorCoresForJoining} - --executor-memory=${sparkExecutorMemoryForJoining} - --driver-memory=${sparkDriverMemoryForJoining} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 - --conf spark.network.timeout=${sparkNetworkTimeout} - - --inputRelationsPath${workingDir}/relation - --inputEntityPath${inputGraphRootPath}/dataset - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Dataset - --outputPath${workingDir}/join_partial/dataset - - - - - - - - yarn - cluster - Join[relation.target = otherresearchproduct.id] - eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1 - dhp-graph-provision-${projectVersion}.jar - - 
--executor-cores=${sparkExecutorCoresForJoining} - --executor-memory=${sparkExecutorMemoryForJoining} - --driver-memory=${sparkDriverMemoryForJoining} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 - --conf spark.network.timeout=${sparkNetworkTimeout} - - --inputRelationsPath${workingDir}/relation - --inputEntityPath${inputGraphRootPath}/otherresearchproduct - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct - --outputPath${workingDir}/join_partial/otherresearchproduct - - - - - - - - yarn - cluster - Join[relation.target = software.id] - eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1 - dhp-graph-provision-${projectVersion}.jar - - --executor-cores=${sparkExecutorCoresForJoining} - --executor-memory=${sparkExecutorMemoryForJoining} - --driver-memory=${sparkDriverMemoryForJoining} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 - --conf spark.network.timeout=${sparkNetworkTimeout} - - --inputRelationsPath${workingDir}/relation - --inputEntityPath${inputGraphRootPath}/software - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Software - --outputPath${workingDir}/join_partial/software - - - - - - - - yarn - cluster - Join[relation.target = datasource.id] - eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1 - dhp-graph-provision-${projectVersion}.jar - - --executor-cores=${sparkExecutorCoresForJoining} - --executor-memory=${sparkExecutorMemoryForJoining} - --driver-memory=${sparkDriverMemoryForJoining} - --conf 
spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 - --conf spark.network.timeout=${sparkNetworkTimeout} - - --inputRelationsPath${workingDir}/relation - --inputEntityPath${inputGraphRootPath}/datasource - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Datasource - --outputPath${workingDir}/join_partial/datasource - - - - - - - - yarn - cluster - Join[relation.target = organization.id] - eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1 - dhp-graph-provision-${projectVersion}.jar - - --executor-cores=${sparkExecutorCoresForJoining} - --executor-memory=${sparkExecutorMemoryForJoining} - --driver-memory=${sparkDriverMemoryForJoining} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 - --conf spark.network.timeout=${sparkNetworkTimeout} - - --inputRelationsPath${workingDir}/relation - --inputEntityPath${inputGraphRootPath}/organization - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Organization - --outputPath${workingDir}/join_partial/organization - - - - - - - - yarn - cluster - Join[relation.target = project.id] - eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1 - dhp-graph-provision-${projectVersion}.jar - - --executor-cores=${sparkExecutorCoresForJoining} - --executor-memory=${sparkExecutorMemoryForJoining} - --driver-memory=${sparkDriverMemoryForJoining} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf 
spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 - --conf spark.network.timeout=${sparkNetworkTimeout} - - --inputRelationsPath${workingDir}/relation - --inputEntityPath${inputGraphRootPath}/project - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Project - --outputPath${workingDir}/join_partial/project - - - - - - - - - - - - - - - - - - - - yarn - cluster - Join[publication.id = relatedEntity.source] - eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2 - dhp-graph-provision-${projectVersion}.jar - - --executor-cores=${sparkExecutorCoresForJoining} - --executor-memory=${sparkExecutorMemoryForJoining} - --driver-memory=${sparkDriverMemoryForJoining} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=15360 - --conf spark.network.timeout=${sparkNetworkTimeout} - - --inputEntityPath${inputGraphRootPath}/publication - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication - --inputRelatedEntitiesPath${workingDir}/join_partial - --outputPath${workingDir}/join_entities/publication - --numPartitions30000 - - - - - - - - yarn - cluster - Join[dataset.id = relatedEntity.source] - eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2 - dhp-graph-provision-${projectVersion}.jar - - --executor-cores=${sparkExecutorCoresForJoining} - --executor-memory=${sparkExecutorMemoryForJoining} - --driver-memory=${sparkDriverMemoryForJoining} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf 
spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 - --conf spark.network.timeout=${sparkNetworkTimeout} - - --inputEntityPath${inputGraphRootPath}/dataset - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Dataset - --inputRelatedEntitiesPath${workingDir}/join_partial - --outputPath${workingDir}/join_entities/dataset - --numPartitions20000 - - - - - - - - yarn - cluster - Join[otherresearchproduct.id = relatedEntity.source] - eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2 - dhp-graph-provision-${projectVersion}.jar - - --executor-cores=${sparkExecutorCoresForJoining} - --executor-memory=${sparkExecutorMemoryForJoining} - --driver-memory=${sparkDriverMemoryForJoining} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 - --conf spark.network.timeout=${sparkNetworkTimeout} - - --inputEntityPath${inputGraphRootPath}/otherresearchproduct - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct - --inputRelatedEntitiesPath${workingDir}/join_partial - --outputPath${workingDir}/join_entities/otherresearchproduct - --numPartitions10000 - - - - - - - - yarn - cluster - Join[software.id = relatedEntity.source] - eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2 - dhp-graph-provision-${projectVersion}.jar - - --executor-cores=${sparkExecutorCoresForJoining} - --executor-memory=${sparkExecutorMemoryForJoining} - --driver-memory=${sparkDriverMemoryForJoining} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf 
spark.sql.shuffle.partitions=3840 - --conf spark.network.timeout=${sparkNetworkTimeout} - - --inputEntityPath${inputGraphRootPath}/software - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Software - --inputRelatedEntitiesPath${workingDir}/join_partial - --outputPath${workingDir}/join_entities/software - --numPartitions10000 - - - - - - - - yarn - cluster - Join[datasource.id = relatedEntity.source] - eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2 - dhp-graph-provision-${projectVersion}.jar - - --executor-cores=${sparkExecutorCoresForJoining} - --executor-memory=${sparkExecutorMemoryForJoining} - --driver-memory=${sparkDriverMemoryForJoining} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 - --conf spark.network.timeout=${sparkNetworkTimeout} - - --inputEntityPath${inputGraphRootPath}/datasource - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Datasource - --inputRelatedEntitiesPath${workingDir}/join_partial - --outputPath${workingDir}/join_entities/datasource - --numPartitions1000 - - - - - - - - yarn - cluster - Join[organization.id = relatedEntity.source] - eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2 - dhp-graph-provision-${projectVersion}.jar - - --executor-cores=${sparkExecutorCoresForJoining} - --executor-memory=${sparkExecutorMemoryForJoining} - --driver-memory=${sparkDriverMemoryForJoining} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 - --conf spark.network.timeout=${sparkNetworkTimeout} - - 
--inputEntityPath${inputGraphRootPath}/organization - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Organization - --inputRelatedEntitiesPath${workingDir}/join_partial - --outputPath${workingDir}/join_entities/organization - --numPartitions20000 - - - - - - - - yarn - cluster - Join[project.id = relatedEntity.source] - eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2 - dhp-graph-provision-${projectVersion}.jar - - --executor-cores=${sparkExecutorCoresForJoining} - --executor-memory=${sparkExecutorMemoryForJoining} - --driver-memory=${sparkDriverMemoryForJoining} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 - --conf spark.network.timeout=${sparkNetworkTimeout} - - --inputEntityPath${inputGraphRootPath}/project - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Project - --inputRelatedEntitiesPath${workingDir}/join_partial - --outputPath${workingDir}/join_entities/project - --numPartitions10000 - - - - - - - - - - yarn - cluster - convert_to_xml - eu.dnetlib.dhp.oa.provision.XmlConverterJob - dhp-graph-provision-${projectVersion}.jar - - --executor-cores=${sparkExecutorCoresForJoining} - --executor-memory=${sparkExecutorMemoryForJoining} - --driver-memory=${sparkDriverMemoryForJoining} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 - --conf spark.network.timeout=${sparkNetworkTimeout} - - --inputPath${workingDir}/join_entities - --outputPath${workingDir}/xml - --isLookupUrl${isLookupUrl} - --otherDsTypeId${otherDsTypeId} - - - - - - - 
- yarn - cluster - to_solr_index - eu.dnetlib.dhp.oa.provision.XmlIndexingJob - dhp-graph-provision-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemoryForIndexing} - --driver-memory=${sparkDriverMemoryForIndexing} - --conf spark.dynamicAllocation.enabled=true - --conf spark.dynamicAllocation.maxExecutors=${sparkExecutorCoresForIndexing} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.speculation=false - --conf spark.hadoop.mapreduce.map.speculative=false - --conf spark.hadoop.mapreduce.reduce.speculative=false - - --inputPath${workingDir}/xml - --isLookupUrl${isLookupUrl} - --format${format} - --batchSize${batchSize} - + + \ No newline at end of file From 46e76affeb8658e7aae037c9f20b84f803e9de6b Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 24 Jun 2020 19:01:15 +0200 Subject: [PATCH 23/37] WIP: prepare relation job --- .../dhp/oa/provision/PrepareRelationsJob.java | 45 ++++++++++++++++++- .../provision/model/SortableRelationKey.java | 39 ++++++---------- 2 files changed, 57 insertions(+), 27 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java index d69b75b65..6b34899c8 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java @@ -3,7 +3,9 @@ package eu.dnetlib.dhp.oa.provision; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import java.io.Serializable; import java.util.*; +import java.util.function.Supplier; import 
org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; @@ -18,7 +20,9 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.base.Splitter; +import com.google.common.collect.ComparisonChain; import com.google.common.collect.Iterables; +import com.google.common.collect.Maps; import com.google.common.collect.Sets; import eu.dnetlib.dhp.application.ArgumentApplicationParser; @@ -59,6 +63,21 @@ public class PrepareRelationsJob { public static final int DEFAULT_NUM_PARTITIONS = 3000; + private static final Map weights = Maps.newHashMap(); + + static { + weights.put("outcome", 0); + weights.put("supplement", 1); + weights.put("affiliation", 2); + weights.put("relationship", 3); + weights.put("publicationDataset", 4); + weights.put("similarity", 5); + + weights.put("provision", 6); + weights.put("participation", 7); + weights.put("dedup", 8); + } + public static void main(String[] args) throws Exception { String jsonConfiguration = IOUtils .toString( @@ -132,11 +151,15 @@ public class PrepareRelationsJob { .filter(rel -> !relationFilter.contains(rel.getRelClass())) // group by SOURCE and apply limit .groupBy(r -> SortableRelationKey.create(r, r.getSource())) - .repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions)) + .repartitionAndSortWithinPartitions( + new RelationPartitioner(relPartitions), + (SerializableComparator) (o1, o2) -> compare(o1, o2)) .flatMap(t -> Iterables.limit(t._2(), maxRelations).iterator()) // group by TARGET and apply limit .groupBy(r -> SortableRelationKey.create(r, r.getTarget())) - .repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions)) + .repartitionAndSortWithinPartitions( + new RelationPartitioner(relPartitions), + (SerializableComparator) (o1, o2) -> compare(o1, o2)) .flatMap(t -> Iterables.limit(t._2(), maxRelations).iterator()) .rdd(); @@ -147,6 +170,24 @@ public class PrepareRelationsJob { .parquet(outputPath); } + private static int 
compare(SortableRelationKey o1, SortableRelationKey o2) { + final Integer w1 = Optional.ofNullable(weights.get(o1.getSubRelType())).orElse(Integer.MAX_VALUE); + final Integer w2 = Optional.ofNullable(weights.get(o2.getSubRelType())).orElse(Integer.MAX_VALUE); + return ComparisonChain + .start() + .compare(w1, w2) + .compare(o1.getSource(), o2.getSource()) + .compare(o1.getTarget(), o2.getTarget()) + .result(); + } + + @FunctionalInterface + public interface SerializableComparator extends Comparator, Serializable { + + @Override + int compare(T o1, T o2); + } + /** * Reads a JavaRDD of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text * file, diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java index ad61fa044..ab6518809 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java @@ -5,27 +5,13 @@ import java.io.Serializable; import java.util.Map; import java.util.Optional; +import com.google.common.base.Objects; import com.google.common.collect.ComparisonChain; import com.google.common.collect.Maps; import eu.dnetlib.dhp.schema.oaf.Relation; -public class SortableRelationKey implements Comparable, Serializable { - - private static final Map weights = Maps.newHashMap(); - - static { - weights.put("outcome", 0); - weights.put("supplement", 1); - weights.put("affiliation", 2); - weights.put("relationship", 3); - weights.put("publicationDataset", 4); - weights.put("similarity", 5); - - weights.put("provision", 6); - weights.put("participation", 7); - weights.put("dedup", 8); - } +public class SortableRelationKey implements Serializable { private String groupingKey; @@ -49,15 +35,18 
@@ public class SortableRelationKey implements Comparable, Ser } @Override - public int compareTo(SortableRelationKey o) { - final Integer wt = Optional.ofNullable(weights.get(getSubRelType())).orElse(Integer.MAX_VALUE); - final Integer wo = Optional.ofNullable(weights.get(o.getSubRelType())).orElse(Integer.MAX_VALUE); - return ComparisonChain - .start() - .compare(wt, wo) - .compare(getSource(), o.getSource()) - .compare(getTarget(), o.getTarget()) - .result(); + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + SortableRelationKey that = (SortableRelationKey) o; + return Objects.equal(getGroupingKey(), that.getGroupingKey()); + } + + @Override + public int hashCode() { + return Objects.hashCode(getGroupingKey()); } public void setSource(String source) { From 77d2a1b1c49f63ec8faeea6404c60c8d2bfcb6f1 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Thu, 25 Jun 2020 09:28:13 +0200 Subject: [PATCH 24/37] params to choose sql queries for beta or production --- .../dhp/broker/oa/matchers/UpdateMatcher.java | 1 + .../AbstractEnrichMissingDataset.java | 6 +- .../AbstractEnrichMissingPublication.java | 6 +- .../EnrichMissingSoftware.java | 2 +- .../relatedSoftware/EnrichMoreSoftware.java | 2 +- .../dhp/broker/oa/util/BrokerConstants.java | 2 + .../withRels/RelatedDatasetAggregator.java | 12 ++- .../withRels/RelatedProjectAggregator.java | 12 ++- .../RelatedPublicationAggregator.java | 13 ++- .../withRels/RelatedSoftwareAggregator.java | 12 ++- .../broker/oa/partial/oozie_app/workflow.xml | 83 ++----------------- .../raw/MigrateDbEntitiesApplication.java | 11 ++- .../graph/migrate_db_entities_parameters.json | 6 ++ .../oa/graph/raw_all/oozie_app/workflow.xml | 7 ++ 14 files changed, 79 insertions(+), 96 deletions(-) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java 
b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java index 9aa6f5384..4691ed65e 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java @@ -45,6 +45,7 @@ public abstract class UpdateMatcher { final Topic topic = getTopicFunction().apply(hl); final UpdateInfo info = new UpdateInfo<>(topic, hl, source, res, getCompileHighlightFunction(), getHighlightToStringFunction(), dedupConfig); + final String s = DigestUtils.md5Hex(info.getHighlightValueAsString()); if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) { } else { diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java index c197734a3..c8b93596a 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java @@ -16,7 +16,7 @@ public abstract class AbstractEnrichMissingDataset extends UpdateMatcher topic, (p, rel) -> p.getDatasets().add(rel), - rel -> rel.getOriginalId()); + rel -> rel.getOpenaireId()); } protected abstract boolean filterByType(String relType); @@ -29,14 +29,14 @@ public abstract class AbstractEnrichMissingDataset extends UpdateMatcher filterByType(rel.getRelType())) - .map(OaBrokerRelatedDataset::getOriginalId) + .map(OaBrokerRelatedDataset::getOpenaireId) .collect(Collectors.toSet()); return source .getDatasets() .stream() .filter(rel -> filterByType(rel.getRelType())) - .filter(d -> !existingDatasets.contains(d.getOriginalId())) + .filter(d 
-> !existingDatasets.contains(d.getOpenaireId())) .collect(Collectors.toList()); } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java index ad6d8263b..cc4f68f87 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java @@ -16,7 +16,7 @@ public abstract class AbstractEnrichMissingPublication extends UpdateMatcher topic, (p, rel) -> p.getPublications().add(rel), - rel -> rel.getOriginalId()); + rel -> rel.getOpenaireId()); } @@ -31,14 +31,14 @@ public abstract class AbstractEnrichMissingPublication extends UpdateMatcher filterByType(rel.getRelType())) - .map(OaBrokerRelatedPublication::getOriginalId) + .map(OaBrokerRelatedPublication::getOpenaireId) .collect(Collectors.toSet()); return source .getPublications() .stream() .filter(rel -> filterByType(rel.getRelType())) - .filter(p -> !existingPublications.contains(p.getOriginalId())) + .filter(p -> !existingPublications.contains(p.getOpenaireId())) .collect(Collectors.toList()); } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java index 452caa503..d01f0c370 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java @@ -16,7 +16,7 @@ public 
class EnrichMissingSoftware super(true, s -> Topic.ENRICH_MISSING_SOFTWARE, (p, s) -> p.getSoftwares().add(s), - s -> s.getName()); + s -> s.getOpenaireId()); } @Override diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java index aaffe1249..a612b6074 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java @@ -16,7 +16,7 @@ public class EnrichMoreSoftware extends UpdateMatcher { super(true, s -> Topic.ENRICH_MORE_SOFTWARE, (p, s) -> p.getSoftwares().add(s), - s -> s.getName()); + s -> s.getOpenaireId()); } @Override diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java index 49c46c7f0..58e41acbb 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java @@ -17,6 +17,8 @@ public class BrokerConstants { public static final float MIN_TRUST = 0.25f; public static final float MAX_TRUST = 1.00f; + public static final int MAX_NUMBER_OF_RELS = 20; + public static Class[] getModelClasses() { final Set> list = new HashSet<>(); list.addAll(Arrays.asList(ModelSupport.getOafModelClasses())); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDatasetAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDatasetAggregator.java index 
a963f073d..45000f6f3 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDatasetAggregator.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDatasetAggregator.java @@ -7,6 +7,7 @@ import org.apache.spark.sql.Encoders; import org.apache.spark.sql.expressions.Aggregator; import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; import scala.Tuple2; public class RelatedDatasetAggregator @@ -30,7 +31,7 @@ public class RelatedDatasetAggregator @Override public OaBrokerMainEntity reduce(final OaBrokerMainEntity g, final Tuple2 t) { final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOpenaireId()) ? g : t._1; - if (t._2 != null) { + if (t._2 != null && res.getDatasets().size() < BrokerConstants.MAX_NUMBER_OF_RELS) { res.getDatasets().add(t._2.getRelDataset()); } return res; @@ -40,7 +41,14 @@ public class RelatedDatasetAggregator @Override public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) { if (StringUtils.isNotBlank(g1.getOpenaireId())) { - g1.getDatasets().addAll(g2.getDatasets()); + final int availables = BrokerConstants.MAX_NUMBER_OF_RELS - g1.getDatasets().size(); + if (availables > 0) { + if (g2.getDatasets().size() <= availables) { + g1.getDatasets().addAll(g2.getDatasets()); + } else { + g1.getDatasets().addAll(g2.getDatasets().subList(0, availables)); + } + } return g1; } else { return g2; diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProjectAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProjectAggregator.java index 3fedb1a32..787217837 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProjectAggregator.java +++ 
b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProjectAggregator.java @@ -7,6 +7,7 @@ import org.apache.spark.sql.Encoders; import org.apache.spark.sql.expressions.Aggregator; import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; import scala.Tuple2; public class RelatedProjectAggregator @@ -30,7 +31,7 @@ public class RelatedProjectAggregator @Override public OaBrokerMainEntity reduce(final OaBrokerMainEntity g, final Tuple2 t) { final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOpenaireId()) ? g : t._1; - if (t._2 != null) { + if (t._2 != null && res.getProjects().size() < BrokerConstants.MAX_NUMBER_OF_RELS) { res.getProjects().add(t._2.getRelProject()); } return res; @@ -40,7 +41,14 @@ public class RelatedProjectAggregator @Override public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) { if (StringUtils.isNotBlank(g1.getOpenaireId())) { - g1.getProjects().addAll(g2.getProjects()); + final int availables = BrokerConstants.MAX_NUMBER_OF_RELS - g1.getProjects().size(); + if (availables > 0) { + if (g2.getProjects().size() <= availables) { + g1.getProjects().addAll(g2.getProjects()); + } else { + g1.getProjects().addAll(g2.getProjects().subList(0, availables)); + } + } return g1; } else { return g2; diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublicationAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublicationAggregator.java index b331599ad..2289ebe36 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublicationAggregator.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublicationAggregator.java @@ -7,6 +7,7 @@ import 
org.apache.spark.sql.Encoders; import org.apache.spark.sql.expressions.Aggregator; import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; import scala.Tuple2; public class RelatedPublicationAggregator @@ -31,7 +32,7 @@ public class RelatedPublicationAggregator public OaBrokerMainEntity reduce(final OaBrokerMainEntity g, final Tuple2 t) { final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOpenaireId()) ? g : t._1; - if (t._2 != null) { + if (t._2 != null && res.getPublications().size() < BrokerConstants.MAX_NUMBER_OF_RELS) { res.getPublications().add(t._2.getRelPublication()); } return res; @@ -41,8 +42,16 @@ public class RelatedPublicationAggregator @Override public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) { if (StringUtils.isNotBlank(g1.getOpenaireId())) { - g1.getPublications().addAll(g2.getPublications()); + final int availables = BrokerConstants.MAX_NUMBER_OF_RELS - g1.getPublications().size(); + if (availables > 0) { + if (g2.getPublications().size() <= availables) { + g1.getPublications().addAll(g2.getPublications()); + } else { + g1.getPublications().addAll(g2.getPublications().subList(0, availables)); + } + } return g1; + } else { return g2; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftwareAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftwareAggregator.java index d3b1c3407..fedb3c9e9 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftwareAggregator.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftwareAggregator.java @@ -7,6 +7,7 @@ import org.apache.spark.sql.Encoders; import org.apache.spark.sql.expressions.Aggregator; import 
eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; import scala.Tuple2; public class RelatedSoftwareAggregator @@ -30,7 +31,7 @@ public class RelatedSoftwareAggregator @Override public OaBrokerMainEntity reduce(final OaBrokerMainEntity g, final Tuple2 t) { final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOpenaireId()) ? g : t._1; - if (t._2 != null) { + if (t._2 != null && res.getSoftwares().size() < BrokerConstants.MAX_NUMBER_OF_RELS) { res.getSoftwares().add(t._2.getRelSoftware()); } return res; @@ -40,7 +41,14 @@ public class RelatedSoftwareAggregator @Override public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) { if (StringUtils.isNotBlank(g1.getOpenaireId())) { - g1.getSoftwares().addAll(g2.getSoftwares()); + final int availables = BrokerConstants.MAX_NUMBER_OF_RELS - g1.getSoftwares().size(); + if (availables > 0) { + if (g2.getSoftwares().size() <= availables) { + g1.getSoftwares().addAll(g2.getSoftwares()); + } else { + g1.getSoftwares().addAll(g2.getSoftwares().subList(0, availables)); + } + } return g1; } else { return g2; diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml index 26fa429e6..fd68bfec2 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml @@ -73,19 +73,19 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + yarn cluster - JoinStep1 - eu.dnetlib.dhp.broker.oa.JoinStep1Job + GenerateEventsJob + eu.dnetlib.dhp.broker.oa.GenerateEventsJob dhp-broker-events-${projectVersion}.jar --executor-cores=${sparkExecutorCores} @@ -97,80 +97,9 @@ --conf 
spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 - --graphPath${graphInputPath} - --workingPath${workingPath} - - - - - - - - yarn - cluster - JoinStep2 - eu.dnetlib.dhp.broker.oa.JoinStep2Job - dhp-broker-events-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 - - --graphPath${graphInputPath} - --workingPath${workingPath} - - - - - - - - yarn - cluster - JoinStep3 - eu.dnetlib.dhp.broker.oa.JoinStep3Job - dhp-broker-events-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 - - --graphPath${graphInputPath} - --workingPath${workingPath} - - - - - - - - yarn - cluster - JoinStep4 - eu.dnetlib.dhp.broker.oa.JoinStep4Job - dhp-broker-events-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 - - --graphPath${graphInputPath} 
--workingPath${workingPath} + --isLookupUrl${isLookupUrl} + --dedupConfProfile${dedupConfProfId} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index 0bad89e9e..da2ba4723 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -50,8 +50,6 @@ import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.google.common.collect.Lists; - import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.DbClient; import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication; @@ -106,6 +104,9 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i final String dbPassword = parser.get("postgresPassword"); log.info("postgresPassword: xxx"); + final String dbSchema = parser.get("dbschema"); + log.info("dbSchema {}: " + dbSchema); + final String isLookupUrl = parser.get("isLookupUrl"); log.info("isLookupUrl: {}", isLookupUrl); @@ -125,7 +126,11 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i smdbe.execute("queryDatasources.sql", smdbe::processDatasource); log.info("Processing projects..."); - smdbe.execute("queryProjects.sql", smdbe::processProject); + if (dbSchema.equalsIgnoreCase("beta")) { + smdbe.execute("queryProjects.sql", smdbe::processProject); + } else { + smdbe.execute("queryProjects_production.sql", smdbe::processProject); + } log.info("Processing orgs..."); smdbe.execute("queryOrganizations.sql", smdbe::processOrganization); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/migrate_db_entities_parameters.json 
b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/migrate_db_entities_parameters.json index 4e838561d..6dfef32db 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/migrate_db_entities_parameters.json +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/migrate_db_entities_parameters.json @@ -34,5 +34,11 @@ "paramLongName": "isLookupUrl", "paramDescription": "the url of the ISLookupService", "paramRequired": true + }, + { + "paramName": "dbschema", + "paramLongName": "dbschema", + "paramDescription": "the database schema according to the D-Net infrastructure (beta or production)", + "paramRequired": true } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml index 9a7e36570..3bf3cc7a7 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml @@ -25,6 +25,11 @@ postgresPassword the password postgres + + + dbSchema + the database schema according to the D-Net infrastructure (beta or production) + beta mongoURL @@ -125,6 +130,7 @@ --postgresPassword${postgresPassword} --isLookupUrl${isLookupUrl} --actionclaims + --dbschema${dbSchema} @@ -175,6 +181,7 @@ --postgresUser${postgresUser} --postgresPassword${postgresPassword} --isLookupUrl${isLookupUrl} + --dbschema${dbSchema} From abcbebcbb4a52f8ee74281693d7845548243f60f Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Thu, 25 Jun 2020 09:50:46 +0200 Subject: [PATCH 25/37] fixed generation of ids --- .../oa/graph/raw/common/OafMapperUtils.java | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git 
a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java index 58f068943..8ede40773 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java @@ -9,7 +9,15 @@ import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; -import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.ExtraInfo; +import eu.dnetlib.dhp.schema.oaf.Field; +import eu.dnetlib.dhp.schema.oaf.Journal; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.OAIProvenance; +import eu.dnetlib.dhp.schema.oaf.OriginDescription; +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.utils.DHPUtils; public class OafMapperUtils { @@ -89,7 +97,9 @@ public class OafMapperUtils { } public static StructuredProperty structuredProperty( - final String value, final Qualifier qualifier, final DataInfo dataInfo) { + final String value, + final Qualifier qualifier, + final DataInfo dataInfo) { if (value == null) { return null; } @@ -192,8 +202,12 @@ public class OafMapperUtils { } public static String createOpenaireId( - final int prefix, final String originalId, final boolean to_md5) { - if (to_md5) { + final int prefix, + final String originalId, + final boolean to_md5) { + if (StringUtils.isBlank(originalId)) { + return null; + } else if (to_md5) { final String nsPrefix = StringUtils.substringBefore(originalId, "::"); final String rest = StringUtils.substringAfter(originalId, "::"); return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest)); @@ -203,7 +217,9 @@ public class OafMapperUtils { } public static String createOpenaireId( - 
final String type, final String originalId, final boolean to_md5) { + final String type, + final String originalId, + final boolean to_md5) { switch (type) { case "datasource": return createOpenaireId(10, originalId, to_md5); From 69b0391708edcc5b3d0ef3dcd73b68ae3a0ca51a Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 25 Jun 2020 10:19:56 +0200 Subject: [PATCH 26/37] WIP: prepare relation job --- .../dhp/oa/provision/PrepareRelationsJob.java | 67 ++++++------------- .../provision/model/SortableRelationKey.java | 48 +++++++------ .../provision/utils/RelationPartitioner.java | 12 ++++ 3 files changed, 62 insertions(+), 65 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java index 6b34899c8..4ae822df7 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java @@ -11,6 +11,8 @@ import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.rdd.RDD; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SaveMode; @@ -30,6 +32,7 @@ import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey; import eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner; import eu.dnetlib.dhp.schema.oaf.Relation; +import scala.Tuple2; /** * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. 
The @@ -63,21 +66,6 @@ public class PrepareRelationsJob { public static final int DEFAULT_NUM_PARTITIONS = 3000; - private static final Map weights = Maps.newHashMap(); - - static { - weights.put("outcome", 0); - weights.put("supplement", 1); - weights.put("affiliation", 2); - weights.put("relationship", 3); - weights.put("publicationDataset", 4); - weights.put("similarity", 5); - - weights.put("provision", 6); - weights.put("participation", 7); - weights.put("dedup", 8); - } - public static void main(String[] args) throws Exception { String jsonConfiguration = IOUtils .toString( @@ -146,21 +134,26 @@ public class PrepareRelationsJob { int relPartitions) { RDD cappedRels = readPathRelationRDD(spark, inputRelationsPath) - .repartition(relPartitions) - .filter(rel -> !rel.getDataInfo().getDeletedbyinference()) - .filter(rel -> !relationFilter.contains(rel.getRelClass())) + .filter(rel -> rel.getDataInfo().getDeletedbyinference() == false) + .filter(rel -> relationFilter.contains(rel.getRelClass()) == false) + // group by SOURCE and apply limit - .groupBy(r -> SortableRelationKey.create(r, r.getSource())) - .repartitionAndSortWithinPartitions( - new RelationPartitioner(relPartitions), - (SerializableComparator) (o1, o2) -> compare(o1, o2)) - .flatMap(t -> Iterables.limit(t._2(), maxRelations).iterator()) + .mapToPair(r -> new Tuple2<>(SortableRelationKey.create(r, r.getSource()), r)) + .repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions)) + .groupBy(Tuple2::_1) + .map(Tuple2::_2) + .map(t -> Iterables.limit(t, maxRelations)) + .flatMap(Iterable::iterator) + .map(Tuple2::_2) + // group by TARGET and apply limit - .groupBy(r -> SortableRelationKey.create(r, r.getTarget())) - .repartitionAndSortWithinPartitions( - new RelationPartitioner(relPartitions), - (SerializableComparator) (o1, o2) -> compare(o1, o2)) - .flatMap(t -> Iterables.limit(t._2(), maxRelations).iterator()) + .mapToPair(r -> new Tuple2<>(SortableRelationKey.create(r, r.getTarget()), 
r)) + .repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions)) + .groupBy(Tuple2::_1) + .map(Tuple2::_2) + .map(t -> Iterables.limit(t, maxRelations)) + .flatMap(Iterable::iterator) + .map(Tuple2::_2) .rdd(); spark @@ -170,24 +163,6 @@ public class PrepareRelationsJob { .parquet(outputPath); } - private static int compare(SortableRelationKey o1, SortableRelationKey o2) { - final Integer w1 = Optional.ofNullable(weights.get(o1.getSubRelType())).orElse(Integer.MAX_VALUE); - final Integer w2 = Optional.ofNullable(weights.get(o2.getSubRelType())).orElse(Integer.MAX_VALUE); - return ComparisonChain - .start() - .compare(w1, w2) - .compare(o1.getSource(), o2.getSource()) - .compare(o1.getTarget(), o2.getTarget()) - .result(); - } - - @FunctionalInterface - public interface SerializableComparator extends Comparator, Serializable { - - @Override - int compare(T o1, T o2); - } - /** * Reads a JavaRDD of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text * file, diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java index ab6518809..e96c4ca5c 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java @@ -11,25 +11,34 @@ import com.google.common.collect.Maps; import eu.dnetlib.dhp.schema.oaf.Relation; -public class SortableRelationKey implements Serializable { +public class SortableRelationKey implements Comparable, Serializable { + + private static final Map weights = Maps.newHashMap(); + + static { + weights.put("outcome", 0); + weights.put("supplement", 1); + weights.put("review", 2); + weights.put("citation", 3); + weights.put("affiliation", 4); + 
weights.put("relationship", 5); + weights.put("publicationDataset", 6); + weights.put("similarity", 7); + + weights.put("provision", 8); + weights.put("participation", 9); + weights.put("dedup", 10); + } + + private static final long serialVersionUID = 3232323; private String groupingKey; - private String source; - - private String target; - private String subRelType; - public String getSource() { - return source; - } - public static SortableRelationKey create(Relation r, String groupingKey) { SortableRelationKey sr = new SortableRelationKey(); sr.setGroupingKey(groupingKey); - sr.setSource(r.getSource()); - sr.setTarget(r.getTarget()); sr.setSubRelType(r.getSubRelType()); return sr; } @@ -49,16 +58,16 @@ public class SortableRelationKey implements Serializable { return Objects.hashCode(getGroupingKey()); } - public void setSource(String source) { - this.source = source; + @Override + public int compareTo(SortableRelationKey o) { + return ComparisonChain + .start() + .compare(getWeight(this), getWeight(o)) + .result() * -1; } - public String getTarget() { - return target; - } - - public void setTarget(String target) { - this.target = target; + private Integer getWeight(SortableRelationKey o) { + return Optional.ofNullable(weights.get(o.getSubRelType())).orElse(Integer.MAX_VALUE); } public String getSubRelType() { @@ -76,4 +85,5 @@ public class SortableRelationKey implements Serializable { public void setGroupingKey(String groupingKey) { this.groupingKey = groupingKey; } + } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/RelationPartitioner.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/RelationPartitioner.java index bdece36ab..7bd8b9217 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/RelationPartitioner.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/RelationPartitioner.java @@ -12,6 
+12,8 @@ import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey; */ public class RelationPartitioner extends Partitioner { + private static final long serialVersionUID = 343434456L; + private final int numPartitions; public RelationPartitioner(int numPartitions) { @@ -29,4 +31,14 @@ public class RelationPartitioner extends Partitioner { return Utils.nonNegativeMod(partitionKey.getGroupingKey().hashCode(), numPartitions()); } + @Override + public boolean equals(Object obj) { + if (obj instanceof RelationPartitioner) { + RelationPartitioner p = (RelationPartitioner) obj; + if (p.numPartitions() == numPartitions()) + return true; + } + return false; + } + } From a6c0faac7064e64b7d122926005de014aea3919e Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Thu, 25 Jun 2020 10:48:15 +0200 Subject: [PATCH 27/37] added test to verify secondary sorting --- .../doiboost/DoiBoostMappingUtil.scala | 22 ++ .../dnetlib/doiboost/mag/MagDataModel.scala | 6 +- .../doiboost/mag/SparkPreProcessMAG.scala | 2 +- .../intersection/oozie_app/workflow.xml | 2 +- .../crossref/CrossrefMappingTest.scala | 24 ++ .../dnetlib/doiboost/crossref/orcid_data.json | 271 ++++++++++++++++++ .../oa/provision/SortableRelationKeyTest.java | 41 +++ .../dnetlib/dhp/oa/provision/relations.json | 90 ++++++ 8 files changed, 453 insertions(+), 5 deletions(-) create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/crossref/orcid_data.json create mode 100644 dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SortableRelationKeyTest.java create mode 100644 dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/relations.json diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala index 7b21ecda2..1a45defb0 100644 --- 
a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala @@ -271,6 +271,26 @@ object DoiBoostMappingUtil { } + + def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String): StructuredProperty = { + val sp = new StructuredProperty + sp.setQualifier(createQualifier(classId,className, schemeId, schemeName)) + sp.setValue(value) + sp + + } + + + + def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String, dataInfo: DataInfo): StructuredProperty = { + val sp = new StructuredProperty + sp.setQualifier(createQualifier(classId,className, schemeId, schemeName)) + sp.setValue(value) + sp.setDataInfo(dataInfo) + sp + + } + def createSP(value: String, classId: String, schemeId: String): StructuredProperty = { val sp = new StructuredProperty sp.setQualifier(createQualifier(classId, schemeId)) @@ -279,6 +299,8 @@ object DoiBoostMappingUtil { } + + def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = { val sp = new StructuredProperty sp.setQualifier(createQualifier(classId, schemeId)) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala index 2419f86a3..7bb4686cf 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala @@ -129,16 +129,16 @@ case object ConversionUtil { val fieldOfStudy = item._2 if (fieldOfStudy != null && fieldOfStudy.subjects != null && fieldOfStudy.subjects.nonEmpty) { val p: List[StructuredProperty] = fieldOfStudy.subjects.flatMap(s => { - val s1 = createSP(s.DisplayName, "keyword", "dnet:subject_classification_typologies") + val s1 = 
createSP(s.DisplayName, "MAG","Microsoft Academic Graph classification", "dnet:subject_classification_typologies", "dnet:subject_classification_typologies") val di = DoiBoostMappingUtil.generateDataInfo(s.Score.toString) var resList: List[StructuredProperty] = List(s1) if (s.MainType.isDefined) { val maintp = s.MainType.get - val s2 = createSP(s.MainType.get, "keyword", "dnet:subject_classification_typologies") + val s2 = createSP(s.MainType.get, "MAG","Microsoft Academic Graph classification", "dnet:subject_classification_typologies", "dnet:subject_classification_typologies") s2.setDataInfo(di) resList = resList ::: List(s2) if (maintp.contains(".")) { - val s3 = createSP(maintp.split("\\.").head, "keyword", "dnet:subject_classification_typologies") + val s3 = createSP(maintp.split("\\.").head, "MAG","Microsoft Academic Graph classification", "dnet:subject_classification_typologies", "dnet:subject_classification_typologies") s3.setDataInfo(di) resList = resList ::: List(s3) } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkPreProcessMAG.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkPreProcessMAG.scala index f3d051bd6..a24f0e6bb 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkPreProcessMAG.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkPreProcessMAG.scala @@ -43,7 +43,7 @@ object SparkPreProcessMAG { val distinctPaper: Dataset[MagPapers] = spark.createDataset(result) distinctPaper.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/Papers_distinct") - logger.info("Phase 6) Enrich Publication with description") + logger.info("Phase 0) Enrich Publication with description") val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract] pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract") diff --git 
a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/intersection/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/intersection/oozie_app/workflow.xml index 34ba5d89d..bf91958cf 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/intersection/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/intersection/oozie_app/workflow.xml @@ -32,7 +32,7 @@ - + diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala index d31f80248..f62ac2b67 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala @@ -18,6 +18,9 @@ class CrossrefMappingTest { val mapper = new ObjectMapper() + + + @Test def testFunderRelationshipsMapping(): Unit = { val template = Source.fromInputStream(getClass.getResourceAsStream("article_funder_template.json")).mkString @@ -58,6 +61,27 @@ class CrossrefMappingTest { } + @Test + def testOrcidID() :Unit = { + val json = Source.fromInputStream(getClass.getResourceAsStream("orcid_data.json")).mkString + + + assertNotNull(json) + assertFalse(json.isEmpty); + + val resultList: List[Oaf] = Crossref2Oaf.convert(json) + + assertTrue(resultList.nonEmpty) + + val items = resultList.filter(p => p.isInstanceOf[Result]) + + + mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) + items.foreach(p => println(mapper.writeValueAsString(p))) + + + } + @Test def testEmptyTitle() :Unit = { val json = Source.fromInputStream(getClass.getResourceAsStream("empty_title.json")).mkString diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/crossref/orcid_data.json 
b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/crossref/orcid_data.json new file mode 100644 index 000000000..def546ddb --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/crossref/orcid_data.json @@ -0,0 +1,271 @@ +{ + "DOI":"10.1016/j.carbpol.2020.115930", + "issued":{ + "date-parts":[ + [ + 2020, + 4 + ] + ] + }, + "published-print":{ + "date-parts":[ + [ + 2020, + 4 + ] + ] + }, + "prefix":"10.1016", + "subject":[ + "Organic Chemistry", + "Materials Chemistry", + "Polymers and Plastics" + ], + "author":[ + { + "affiliation":[ + + ], + "given":"Lei", + "family":"Fang", + "sequence":"first" + }, + { + "affiliation":[ + + ], + "given":"Hua", + "family":"Lin", + "sequence":"additional" + }, + { + "affiliation":[ + + ], + "given":"Zhenfeng", + "family":"Wu", + "sequence":"additional" + }, + { + "affiliation":[ + + ], + "given":"Zhen", + "family":"Wang", + "sequence":"additional" + }, + { + "affiliation":[ + + ], + "given":"Xinxin", + "family":"Fan", + "sequence":"additional" + }, + { + "affiliation":[ + + ], + "given":"Ziting", + "family":"Cheng", + "sequence":"additional" + }, + { + "affiliation":[ + + ], + "given":"Xiaoya", + "family":"Hou", + "sequence":"additional" + }, + { + "authenticated-orcid":false, + "given":"Daquan", + "family":"Chen", + "sequence":"additional", + "affiliation":[ + + ], + "ORCID":"http://orcid.org/0000-0002-6796-0204" + } + ], + "reference-count":41, + "ISSN":[ + "0144-8617" + ], + "assertion":[ + { + "name":"publisher", + "value":"Elsevier", + "label":"This article is maintained by" + }, + { + "name":"articletitle", + "value":"In vitro/vivo evaluation of novel mitochondrial targeting charge-reversal polysaccharide-based antitumor nanoparticle", + "label":"Article Title" + }, + { + "name":"journaltitle", + "value":"Carbohydrate Polymers", + "label":"Journal Title" + }, + { + "name":"articlelink", + "value":"https://doi.org/10.1016/j.carbpol.2020.115930", + "label":"CrossRef DOI link 
to publisher maintained version" + }, + { + "name":"content_type", + "value":"article", + "label":"Content Type" + }, + { + "name":"copyright", + "value":"\\u00a9 2020 Elsevier Ltd. All rights reserved.", + "label":"Copyright" + } + ], + "member":"78", + "source":"Crossref", + "score":1.0, + "deposited":{ + "timestamp":1584590965000, + "date-time":"2020-03-19T04:09:25Z", + "date-parts":[ + [ + 2020, + 3, + 19 + ] + ] + }, + "indexed":{ + "timestamp":1584592912467, + "date-time":"2020-03-19T04:41:52Z", + "date-parts":[ + [ + 2020, + 3, + 19 + ] + ] + }, + "type":"journal-article", + "URL":"http://dx.doi.org/10.1016/j.carbpol.2020.115930", + "is-referenced-by-count":0, + "volume":"234", + "issn-type":[ + { + "type":"print", + "value":"0144-8617" + } + ], + "link":[ + { + "URL":"https://api.elsevier.com/content/article/PII:S0144861720301041?httpAccept=text/xml", + "intended-application":"text-mining", + "content-version":"vor", + "content-type":"text/xml" + }, + { + "URL":"https://api.elsevier.com/content/article/PII:S0144861720301041?httpAccept=text/plain", + "intended-application":"text-mining", + "content-version":"vor", + "content-type":"text/plain" + } + ], + "update-policy":"http://dx.doi.org/10.1016/elsevier_cm_policy", + "references-count":41, + "short-container-title":[ + "Carbohydrate Polymers" + ], + "publisher":"Elsevier BV", + "content-domain":{ + "domain":[ + "elsevier.com", + "sciencedirect.com" + ], + "crossmark-restriction":true + }, + "language":"en", + "license":[ + { + "URL":"https://www.elsevier.com/tdm/userlicense/1.0/", + "start":{ + "timestamp":1585699200000, + "date-time":"2020-04-01T00:00:00Z", + "date-parts":[ + [ + 2020, + 4, + 1 + ] + ] + }, + "content-version":"tdm", + "delay-in-days":0 + } + ], + "created":{ + "timestamp":1581759678000, + "date-time":"2020-02-15T09:41:18Z", + "date-parts":[ + [ + 2020, + 2, + 15 + ] + ] + }, + "title":[ + "In vitro/vivo evaluation of novel mitochondrial targeting charge-reversal polysaccharide-based 
antitumor nanoparticle" + ], + "alternative-id":[ + "S0144861720301041" + ], + "container-title":[ + "Carbohydrate Polymers" + ], + "funder":[ + { + "doi-asserted-by":"publisher", + "DOI":"10.13039/501100007129", + "name":"Natural Science Foundation of Shandong Province", + "award":[ + "ZR2019ZD24", + "ZR2019YQ30" + ] + }, + { + "doi-asserted-by":"publisher", + "DOI":"10.13039/100010449", + "name":"Ministry of Education, Libya", + "award":[ + + ] + }, + { + "doi-asserted-by":"publisher", + "DOI":"10.13039/501100012249", + "name":"Jiangxi University of Traditional Chinese Medicine", + "award":[ + "TCM-0906" + ] + }, + { + "name":"Taishan Scholar Program", + "award":[ + "qnts20161035" + ] + }, + { + "name":"Open fund project of Key Laboratory of Modern Preparation of TCM", + "award":[ + + ] + } + ], + "page":"115930", + "article-number":"115930" +} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SortableRelationKeyTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SortableRelationKeyTest.java new file mode 100644 index 000000000..200800bd8 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SortableRelationKeyTest.java @@ -0,0 +1,41 @@ +package eu.dnetlib.dhp.oa.provision; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey; +import eu.dnetlib.dhp.schema.oaf.Relation; +import org.apache.commons.io.IOUtils; +import org.junit.jupiter.api.Test; +import java.io.IOException; +import java.util.List; + +public class SortableRelationKeyTest { + + @Test + public void doTesSorting() throws IOException { + final ObjectMapper mapper = new ObjectMapper(); + final String json = IOUtils.toString(this.getClass().getResourceAsStream("relations.json")); + final 
List relations = mapper.readValue(json, new TypeReference>() { }); + + + relations.stream().map(r -> SortableRelationKey.create(r, r.getSource())).sorted() + .forEach( + + it -> { + try { + System.out.println(mapper.writeValueAsString(it)); + } catch (JsonProcessingException e) { + e.printStackTrace(); + } + }); + + + + + + + } + + +} diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/relations.json b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/relations.json new file mode 100644 index 000000000..3280d0d61 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/relations.json @@ -0,0 +1,90 @@ +[ + { + "collectedfrom": [], + "dataInfo": { + "deletedbyinference": false, + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:entityregistry", + "classname": "Harvested", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "lastupdatetimestamp": 1592688952862, + "properties": [], + "relClass": "hasAuthorInstitution", + "relType": "resultOrganization", + "source": "1", + "subRelType": "affiliation", + "target": "2" + }, + { + "collectedfrom": [], + "dataInfo": { + "deletedbyinference": false, + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:entityregistry", + "classname": "Harvested", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "lastupdatetimestamp": 1592688952862, + "properties": [], + "relClass": "isAuthorInstitutionOf", + "relType": "resultOrganization", + "source": "2", + "subRelType": "affiliation", + "target": "1" + }, + { + "collectedfrom": [], + "dataInfo": { + "deletedbyinference": false, + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:entityregistry", + "classname": 
"Harvested", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "lastupdatetimestamp": 1592688952862, + "properties": [], + "relClass": "isProducedBy", + "relType": "resultProject", + "source": "1", + "subRelType": "outcome", + "target": "2" + }, + { + "collectedfrom": [], + "dataInfo": { + "deletedbyinference": false, + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:entityregistry", + "classname": "Harvested", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "lastupdatetimestamp": 1592688952862, + "properties": [], + "relClass": "produces", + "relType": "resultProject", + "source": "2", + "subRelType": "outcome", + "target": "1" + } +] \ No newline at end of file From 6933ec11fbf5fecdacbad4f3782c2316a322ea69 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 25 Jun 2020 11:04:12 +0200 Subject: [PATCH 28/37] WIP: prepare relation job --- .../dhp/oa/provision/PrepareRelationsJob.java | 8 ++- .../provision/model/SortableRelationKey.java | 3 +- .../oa/provision/SortableRelationKeyTest.java | 53 ++++++++++--------- 3 files changed, 35 insertions(+), 29 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java index 4ae822df7..cf311c690 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java @@ -7,6 +7,8 @@ import java.io.Serializable; import java.util.*; import java.util.function.Supplier; +import javax.annotation.Nullable; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -21,6 +23,7 @@ import 
org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.base.Predicate; import com.google.common.base.Splitter; import com.google.common.collect.ComparisonChain; import com.google.common.collect.Iterables; @@ -142,7 +145,7 @@ public class PrepareRelationsJob { .repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions)) .groupBy(Tuple2::_1) .map(Tuple2::_2) - .map(t -> Iterables.limit(t, maxRelations)) + .map(t -> Iterables.filter(t, input -> input._1().getSubRelType().equals("outcome"))) .flatMap(Iterable::iterator) .map(Tuple2::_2) @@ -151,7 +154,8 @@ public class PrepareRelationsJob { .repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions)) .groupBy(Tuple2::_1) .map(Tuple2::_2) - .map(t -> Iterables.limit(t, maxRelations)) + .map(t -> Iterables.filter(t, input -> input._1().getSubRelType().equals("outcome"))) + // .map(t -> Iterables.limit(t, maxRelations)) .flatMap(Iterable::iterator) .map(Tuple2::_2) .rdd(); diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java index e96c4ca5c..09a1a9d33 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java @@ -62,8 +62,9 @@ public class SortableRelationKey implements Comparable, Ser public int compareTo(SortableRelationKey o) { return ComparisonChain .start() + .compare(getGroupingKey(), o.getGroupingKey()) .compare(getWeight(this), getWeight(o)) - .result() * -1; + .result(); } private Integer getWeight(SortableRelationKey o) { diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SortableRelationKeyTest.java 
b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SortableRelationKeyTest.java index 200800bd8..72f28fdf2 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SortableRelationKeyTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SortableRelationKeyTest.java @@ -1,41 +1,42 @@ + package eu.dnetlib.dhp.oa.provision; +import java.io.IOException; +import java.util.List; + +import org.apache.commons.io.IOUtils; +import org.junit.jupiter.api.Test; + import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.databind.ObjectMapper; + import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey; import eu.dnetlib.dhp.schema.oaf.Relation; -import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.Test; -import java.io.IOException; -import java.util.List; public class SortableRelationKeyTest { - @Test - public void doTesSorting() throws IOException { - final ObjectMapper mapper = new ObjectMapper(); - final String json = IOUtils.toString(this.getClass().getResourceAsStream("relations.json")); - final List relations = mapper.readValue(json, new TypeReference>() { }); + @Test + public void doTesSorting() throws IOException { + final ObjectMapper mapper = new ObjectMapper(); + final String json = IOUtils.toString(this.getClass().getResourceAsStream("relations.json")); + final List relations = mapper.readValue(json, new TypeReference>() { + }); + relations + .stream() + .map(r -> SortableRelationKey.create(r, r.getSource())) + .sorted() + .forEach( - relations.stream().map(r -> SortableRelationKey.create(r, r.getSource())).sorted() - .forEach( - - it -> { - try { - System.out.println(mapper.writeValueAsString(it)); - } catch (JsonProcessingException e) { - e.printStackTrace(); - } - }); - - - - - - - } + it -> { + try { + System.out.println(mapper.writeValueAsString(it)); + } catch 
(JsonProcessingException e) { + e.printStackTrace(); + } + }); + } } From e62333192c80e9a2307239244fe31a01bea6d77b Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 25 Jun 2020 12:22:18 +0200 Subject: [PATCH 29/37] WIP: prepare relation job --- .../dhp/oa/provision/PrepareRelationsJob.java | 58 +++++++++---------- .../provision/model/SortableRelationKey.java | 2 +- 2 files changed, 27 insertions(+), 33 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java index cf311c690..cb1a3b327 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java @@ -1,42 +1,33 @@ package eu.dnetlib.dhp.oa.provision; -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - -import java.io.Serializable; -import java.util.*; -import java.util.function.Supplier; - -import javax.annotation.Nullable; - +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.base.Splitter; +import com.google.common.collect.Iterables; +import com.google.common.collect.Sets; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey; +import eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner; +import eu.dnetlib.dhp.schema.oaf.Relation; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.rdd.RDD; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SaveMode; import 
org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.base.Predicate; -import com.google.common.base.Splitter; -import com.google.common.collect.ComparisonChain; -import com.google.common.collect.Iterables; -import com.google.common.collect.Maps; -import com.google.common.collect.Sets; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey; -import eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner; -import eu.dnetlib.dhp.schema.oaf.Relation; import scala.Tuple2; +import java.util.HashSet; +import java.util.Optional; +import java.util.Set; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + /** * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. The * operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, and @@ -136,32 +127,35 @@ public class PrepareRelationsJob { SparkSession spark, String inputRelationsPath, String outputPath, Set relationFilter, int maxRelations, int relPartitions) { - RDD cappedRels = readPathRelationRDD(spark, inputRelationsPath) + // group by SOURCE and apply limit + RDD bySource = readPathRelationRDD(spark, inputRelationsPath) .filter(rel -> rel.getDataInfo().getDeletedbyinference() == false) .filter(rel -> relationFilter.contains(rel.getRelClass()) == false) - - // group by SOURCE and apply limit .mapToPair(r -> new Tuple2<>(SortableRelationKey.create(r, r.getSource()), r)) .repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions)) .groupBy(Tuple2::_1) .map(Tuple2::_2) - .map(t -> Iterables.filter(t, input -> input._1().getSubRelType().equals("outcome"))) + .map(t -> Iterables.limit(t, maxRelations)) .flatMap(Iterable::iterator) .map(Tuple2::_2) + 
.rdd(); - // group by TARGET and apply limit + // group by TARGET and apply limit + RDD byTarget = readPathRelationRDD(spark, inputRelationsPath) + .filter(rel -> rel.getDataInfo().getDeletedbyinference() == false) + .filter(rel -> relationFilter.contains(rel.getRelClass()) == false) .mapToPair(r -> new Tuple2<>(SortableRelationKey.create(r, r.getTarget()), r)) .repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions)) .groupBy(Tuple2::_1) .map(Tuple2::_2) - .map(t -> Iterables.filter(t, input -> input._1().getSubRelType().equals("outcome"))) - // .map(t -> Iterables.limit(t, maxRelations)) + .map(t -> Iterables.limit(t, maxRelations)) .flatMap(Iterable::iterator) .map(Tuple2::_2) .rdd(); spark - .createDataset(cappedRels, Encoders.bean(Relation.class)) + .createDataset(bySource.union(byTarget), Encoders.bean(Relation.class)) + .repartition(relPartitions) .write() .mode(SaveMode.Overwrite) .parquet(outputPath); diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java index 09a1a9d33..bf7f9330d 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java @@ -50,7 +50,7 @@ public class SortableRelationKey implements Comparable, Ser if (o == null || getClass() != o.getClass()) return false; SortableRelationKey that = (SortableRelationKey) o; - return Objects.equal(getGroupingKey(), that.getGroupingKey()); + return getGroupingKey().equals(that.getGroupingKey()); } @Override From 05a99cfb61383f4db8cc901533f1d04d07ca3850 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 25 Jun 2020 12:36:08 +0200 Subject: [PATCH 30/37] change the position of value and description elements in the workflow definition --- 
.../eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml index 3bf3cc7a7..d8b61b5ea 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml @@ -28,8 +28,8 @@ dbSchema - the database schema according to the D-Net infrastructure (beta or production) beta + the database schema according to the D-Net infrastructure (beta or production) mongoURL From 93f627ea510632d74050abc3fdab0977e8ce21ea Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 25 Jun 2020 12:54:21 +0200 Subject: [PATCH 31/37] code formatting --- .../dhp/oa/provision/PrepareRelationsJob.java | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java index cb1a3b327..19823120c 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java @@ -1,15 +1,12 @@ package eu.dnetlib.dhp.oa.provision; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.base.Splitter; -import com.google.common.collect.Iterables; -import com.google.common.collect.Sets; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey; -import 
eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner; -import eu.dnetlib.dhp.schema.oaf.Relation; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.HashSet; +import java.util.Optional; +import java.util.Set; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -20,14 +17,19 @@ import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.base.Splitter; +import com.google.common.collect.Iterables; +import com.google.common.collect.Sets; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey; +import eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner; +import eu.dnetlib.dhp.schema.oaf.Relation; import scala.Tuple2; -import java.util.HashSet; -import java.util.Optional; -import java.util.Set; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - /** * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. 
The * operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, and From 216975c4ecad5cbe48533b75af9ba714ece4dab0 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 25 Jun 2020 12:55:52 +0200 Subject: [PATCH 32/37] restored complete provision workflow --- .../dhp/oa/provision/oozie_app/workflow.xml | 480 +++++++++++++++++- 1 file changed, 477 insertions(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml index e98cbbc73..0d5121cf1 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml @@ -97,7 +97,18 @@ - + + + + + ${wf:conf('resumeFrom') eq 'prepare_relations'} + ${wf:conf('resumeFrom') eq 'fork_join_related_entities'} + ${wf:conf('resumeFrom') eq 'fork_join_all_entities'} + ${wf:conf('resumeFrom') eq 'convert_to_xml'} + ${wf:conf('resumeFrom') eq 'to_solr_index'} + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] @@ -124,12 +135,475 @@ --outputPath${workingDir}/relation --relPartitions5000 + + + + + + + + + + + + + + + + + yarn + cluster + Join[relation.target = publication.id] + eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1 + dhp-graph-provision-${projectVersion}.jar + + --executor-cores=${sparkExecutorCoresForJoining} + --executor-memory=${sparkExecutorMemoryForJoining} + --driver-memory=${sparkDriverMemoryForJoining} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf 
spark.sql.shuffle.partitions=7680 + --conf spark.network.timeout=${sparkNetworkTimeout} + + --inputRelationsPath${workingDir}/relation + --inputEntityPath${inputGraphRootPath}/publication + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication + --outputPath${workingDir}/join_partial/publication + + + + + + + + yarn + cluster + Join[relation.target = dataset.id] + eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1 + dhp-graph-provision-${projectVersion}.jar + + --executor-cores=${sparkExecutorCoresForJoining} + --executor-memory=${sparkExecutorMemoryForJoining} + --driver-memory=${sparkDriverMemoryForJoining} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + --conf spark.network.timeout=${sparkNetworkTimeout} + + --inputRelationsPath${workingDir}/relation + --inputEntityPath${inputGraphRootPath}/dataset + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Dataset + --outputPath${workingDir}/join_partial/dataset + + + + + + + + yarn + cluster + Join[relation.target = otherresearchproduct.id] + eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1 + dhp-graph-provision-${projectVersion}.jar + + --executor-cores=${sparkExecutorCoresForJoining} + --executor-memory=${sparkExecutorMemoryForJoining} + --driver-memory=${sparkDriverMemoryForJoining} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + --conf spark.network.timeout=${sparkNetworkTimeout} + + --inputRelationsPath${workingDir}/relation + 
--inputEntityPath${inputGraphRootPath}/otherresearchproduct + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --outputPath${workingDir}/join_partial/otherresearchproduct + + + + + + + + yarn + cluster + Join[relation.target = software.id] + eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1 + dhp-graph-provision-${projectVersion}.jar + + --executor-cores=${sparkExecutorCoresForJoining} + --executor-memory=${sparkExecutorMemoryForJoining} + --driver-memory=${sparkDriverMemoryForJoining} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + --conf spark.network.timeout=${sparkNetworkTimeout} + + --inputRelationsPath${workingDir}/relation + --inputEntityPath${inputGraphRootPath}/software + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Software + --outputPath${workingDir}/join_partial/software + + + + + + + + yarn + cluster + Join[relation.target = datasource.id] + eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1 + dhp-graph-provision-${projectVersion}.jar + + --executor-cores=${sparkExecutorCoresForJoining} + --executor-memory=${sparkExecutorMemoryForJoining} + --driver-memory=${sparkDriverMemoryForJoining} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + --conf spark.network.timeout=${sparkNetworkTimeout} + + --inputRelationsPath${workingDir}/relation + --inputEntityPath${inputGraphRootPath}/datasource + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Datasource + 
--outputPath${workingDir}/join_partial/datasource + + + + + + + + yarn + cluster + Join[relation.target = organization.id] + eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1 + dhp-graph-provision-${projectVersion}.jar + + --executor-cores=${sparkExecutorCoresForJoining} + --executor-memory=${sparkExecutorMemoryForJoining} + --driver-memory=${sparkDriverMemoryForJoining} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + --conf spark.network.timeout=${sparkNetworkTimeout} + + --inputRelationsPath${workingDir}/relation + --inputEntityPath${inputGraphRootPath}/organization + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Organization + --outputPath${workingDir}/join_partial/organization + + + + + + + + yarn + cluster + Join[relation.target = project.id] + eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1 + dhp-graph-provision-${projectVersion}.jar + + --executor-cores=${sparkExecutorCoresForJoining} + --executor-memory=${sparkExecutorMemoryForJoining} + --driver-memory=${sparkDriverMemoryForJoining} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + --conf spark.network.timeout=${sparkNetworkTimeout} + + --inputRelationsPath${workingDir}/relation + --inputEntityPath${inputGraphRootPath}/project + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Project + --outputPath${workingDir}/join_partial/project + + + + + + + + + + + + + + + + + + + + yarn + cluster + Join[publication.id = relatedEntity.source] + 
eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2 + dhp-graph-provision-${projectVersion}.jar + + --executor-cores=${sparkExecutorCoresForJoining} + --executor-memory=${sparkExecutorMemoryForJoining} + --driver-memory=${sparkDriverMemoryForJoining} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=15360 + --conf spark.network.timeout=${sparkNetworkTimeout} + + --inputEntityPath${inputGraphRootPath}/publication + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication + --inputRelatedEntitiesPath${workingDir}/join_partial + --outputPath${workingDir}/join_entities/publication + --numPartitions30000 + + + + + + + + yarn + cluster + Join[dataset.id = relatedEntity.source] + eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2 + dhp-graph-provision-${projectVersion}.jar + + --executor-cores=${sparkExecutorCoresForJoining} + --executor-memory=${sparkExecutorMemoryForJoining} + --driver-memory=${sparkDriverMemoryForJoining} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + --conf spark.network.timeout=${sparkNetworkTimeout} + + --inputEntityPath${inputGraphRootPath}/dataset + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Dataset + --inputRelatedEntitiesPath${workingDir}/join_partial + --outputPath${workingDir}/join_entities/dataset + --numPartitions20000 + + + + + + + + yarn + cluster + Join[otherresearchproduct.id = relatedEntity.source] + eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2 + 
dhp-graph-provision-${projectVersion}.jar + + --executor-cores=${sparkExecutorCoresForJoining} + --executor-memory=${sparkExecutorMemoryForJoining} + --driver-memory=${sparkDriverMemoryForJoining} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + --conf spark.network.timeout=${sparkNetworkTimeout} + + --inputEntityPath${inputGraphRootPath}/otherresearchproduct + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --inputRelatedEntitiesPath${workingDir}/join_partial + --outputPath${workingDir}/join_entities/otherresearchproduct + --numPartitions10000 + + + + + + + + yarn + cluster + Join[software.id = relatedEntity.source] + eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2 + dhp-graph-provision-${projectVersion}.jar + + --executor-cores=${sparkExecutorCoresForJoining} + --executor-memory=${sparkExecutorMemoryForJoining} + --driver-memory=${sparkDriverMemoryForJoining} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + --conf spark.network.timeout=${sparkNetworkTimeout} + + --inputEntityPath${inputGraphRootPath}/software + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Software + --inputRelatedEntitiesPath${workingDir}/join_partial + --outputPath${workingDir}/join_entities/software + --numPartitions10000 + + + + + + + + yarn + cluster + Join[datasource.id = relatedEntity.source] + eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2 + dhp-graph-provision-${projectVersion}.jar + + 
--executor-cores=${sparkExecutorCoresForJoining} + --executor-memory=${sparkExecutorMemoryForJoining} + --driver-memory=${sparkDriverMemoryForJoining} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + --conf spark.network.timeout=${sparkNetworkTimeout} + + --inputEntityPath${inputGraphRootPath}/datasource + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Datasource + --inputRelatedEntitiesPath${workingDir}/join_partial + --outputPath${workingDir}/join_entities/datasource + --numPartitions1000 + + + + + + + + yarn + cluster + Join[organization.id = relatedEntity.source] + eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2 + dhp-graph-provision-${projectVersion}.jar + + --executor-cores=${sparkExecutorCoresForJoining} + --executor-memory=${sparkExecutorMemoryForJoining} + --driver-memory=${sparkDriverMemoryForJoining} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + --conf spark.network.timeout=${sparkNetworkTimeout} + + --inputEntityPath${inputGraphRootPath}/organization + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Organization + --inputRelatedEntitiesPath${workingDir}/join_partial + --outputPath${workingDir}/join_entities/organization + --numPartitions20000 + + + + + + + + yarn + cluster + Join[project.id = relatedEntity.source] + eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2 + dhp-graph-provision-${projectVersion}.jar + + --executor-cores=${sparkExecutorCoresForJoining} + 
--executor-memory=${sparkExecutorMemoryForJoining} + --driver-memory=${sparkDriverMemoryForJoining} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + --conf spark.network.timeout=${sparkNetworkTimeout} + + --inputEntityPath${inputGraphRootPath}/project + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Project + --inputRelatedEntitiesPath${workingDir}/join_partial + --outputPath${workingDir}/join_entities/project + --numPartitions10000 + + + + + + + + + + yarn + cluster + convert_to_xml + eu.dnetlib.dhp.oa.provision.XmlConverterJob + dhp-graph-provision-${projectVersion}.jar + + --executor-cores=${sparkExecutorCoresForJoining} + --executor-memory=${sparkExecutorMemoryForJoining} + --driver-memory=${sparkDriverMemoryForJoining} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + --conf spark.network.timeout=${sparkNetworkTimeout} + + --inputPath${workingDir}/join_entities + --outputPath${workingDir}/xml + --isLookupUrl${isLookupUrl} + --otherDsTypeId${otherDsTypeId} + + + + + + + + yarn + cluster + to_solr_index + eu.dnetlib.dhp.oa.provision.XmlIndexingJob + dhp-graph-provision-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemoryForIndexing} + --driver-memory=${sparkDriverMemoryForIndexing} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${sparkExecutorCoresForIndexing} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf 
spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.speculation=false + --conf spark.hadoop.mapreduce.map.speculative=false + --conf spark.hadoop.mapreduce.reduce.speculative=false + + --inputPath${workingDir}/xml + --isLookupUrl${isLookupUrl} + --format${format} + --batchSize${batchSize} + - - \ No newline at end of file From e28033c6d886d81bfc3c4be0c388e998b2708430 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Thu, 25 Jun 2020 13:01:09 +0200 Subject: [PATCH 33/37] some fixes --- .../eu/dnetlib/dhp/broker/model/EventFactory.java | 6 +++--- .../dhp/broker/oa/matchers/UpdateMatcher.java | 14 ++++++++------ .../oa/matchers/simple/EnrichMoreSubject.java | 1 + .../eu/dnetlib/dhp/broker/oa/util/EventGroup.java | 6 +++++- .../eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java | 2 +- 5 files changed, 18 insertions(+), 11 deletions(-) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java index 6e38f7448..b88befbe7 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java @@ -33,7 +33,7 @@ public class EventFactory { final Map map = createMapFromResult(updateInfo); final String eventId = calculateEventId( - updateInfo.getTopicPath(), updateInfo.getTarget().getOriginalId(), updateInfo.getHighlightValueAsString()); + updateInfo.getTopicPath(), updateInfo.getTarget().getOpenaireId(), updateInfo.getHighlightValueAsString()); res.setEventId(eventId); res.setProducerId(PRODUCER_ID); @@ -55,7 +55,7 @@ public class EventFactory { map.put("target_datasource_id", target.getCollectedFromId()); map.put("target_datasource_name", 
target.getCollectedFromName()); - map.put("target_publication_id", target.getOriginalId()); + map.put("target_publication_id", target.getOpenaireId()); final List titles = target.getTitles(); if (titles.size() > 0) { @@ -74,7 +74,7 @@ public class EventFactory { map.put("trust", updateInfo.getTrust()); map.put("provenance_datasource_id", source.getCollectedFromId()); map.put("provenance_datasource_name", source.getCollectedFromName()); - map.put("provenance_publication_id_list", source.getOriginalId()); + map.put("provenance_publication_id_list", source.getOpenaireId()); return map; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java index 4691ed65e..c0287bda0 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java @@ -43,13 +43,15 @@ public abstract class UpdateMatcher { if (source != res) { for (final T hl : findDifferences(source, res)) { final Topic topic = getTopicFunction().apply(hl); - final UpdateInfo info = new UpdateInfo<>(topic, hl, source, res, getCompileHighlightFunction(), - getHighlightToStringFunction(), dedupConfig); + if (topic != null) { + final UpdateInfo info = new UpdateInfo<>(topic, hl, source, res, + getCompileHighlightFunction(), + getHighlightToStringFunction(), dedupConfig); - final String s = DigestUtils.md5Hex(info.getHighlightValueAsString()); - if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) { - } else { - infoMap.put(s, info); + final String s = DigestUtils.md5Hex(info.getHighlightValueAsString()); + if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) { + infoMap.put(s, info); + } } } } diff --git 
a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java index 04fb494ef..97b289b69 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java @@ -22,6 +22,7 @@ public class EnrichMoreSubject extends UpdateMatcher { @Override protected List findDifferences(final OaBrokerMainEntity source, final OaBrokerMainEntity target) { + final Set existingSubjects = target .getSubjects() .stream() diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventGroup.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventGroup.java index 25c7698a0..503e31ae1 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventGroup.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventGroup.java @@ -14,12 +14,16 @@ public class EventGroup implements Serializable { */ private static final long serialVersionUID = 765977943803533130L; - private final List data = new ArrayList<>(); + private List data = new ArrayList<>(); public List getData() { return data; } + public void setData(final List data) { + this.data = data; + } + public EventGroup addElement(final Event elem) { data.add(elem); return this; diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java index 25d0d2bca..048683b50 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java +++ 
b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java @@ -111,7 +111,7 @@ public final class UpdateInfo { final OaBrokerMainEntity hl = new OaBrokerMainEntity(); compileHighlight.accept(hl, getHighlightValue()); - final String provId = getSource().getOriginalId(); + final String provId = getSource().getOpenaireId(); final String provRepo = getSource().getCollectedFromName(); final String provUrl = getSource() From 6f5771c1c997da598fc34aeb3ce421d939997428 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 25 Jun 2020 14:06:21 +0200 Subject: [PATCH 34/37] sets author.rank when null --- .../dhp/oa/graph/clean/CleanGraphSparkJob.java | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java index bdbd64160..e1c4b53b5 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java @@ -152,7 +152,18 @@ public class CleanGraphSparkJob { } } } - + if (Objects.nonNull(r.getAuthor())) { + boolean nullRank = r + .getAuthor() + .stream() + .anyMatch(a -> Objects.isNull(a.getRank())); + if (nullRank) { + int i = 1; + for (Author author : r.getAuthor()) { + author.setRank(i++); + } + } + } if (value instanceof Publication) { } else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) { From 4eb3e109d78a801c7ad8cb8de75e13864a3ff5d9 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Thu, 25 Jun 2020 15:45:50 +0200 Subject: [PATCH 35/37] compilation of event map --- .../eu/dnetlib/dhp/broker/model/Event.java | 9 +- .../dhp/broker/model/EventFactory.java | 31 ++-- .../dhp/broker/model/MappedFields.java | 137 ++++++++++++++++++ 
.../dhp/broker/oa/util/EventFinder.java | 70 +++------ 4 files changed, 180 insertions(+), 67 deletions(-) create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/MappedFields.java diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Event.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Event.java index f94d286e4..18950d98e 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Event.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Event.java @@ -2,7 +2,6 @@ package eu.dnetlib.dhp.broker.model; import java.io.Serializable; -import java.util.Map; public class Event implements Serializable { @@ -25,7 +24,7 @@ public class Event implements Serializable { private boolean instantMessage; - private Map map; + private MappedFields map; public Event() { } @@ -33,7 +32,7 @@ public class Event implements Serializable { public Event(final String producerId, final String eventId, final String topic, final String payload, final Long creationDate, final Long expiryDate, final boolean instantMessage, - final Map map) { + final MappedFields map) { this.producerId = producerId; this.eventId = eventId; this.topic = topic; @@ -100,11 +99,11 @@ public class Event implements Serializable { this.instantMessage = instantMessage; } - public Map getMap() { + public MappedFields getMap() { return this.map; } - public void setMap(final Map map) { + public void setMap(final MappedFields map) { this.map = map; } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java index b88befbe7..315a054d3 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java +++ 
b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java @@ -3,9 +3,8 @@ package eu.dnetlib.dhp.broker.model; import java.text.ParseException; import java.util.Date; -import java.util.HashMap; import java.util.List; -import java.util.Map; +import java.util.stream.Collectors; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; @@ -30,7 +29,7 @@ public class EventFactory { final Event res = new Event(); - final Map map = createMapFromResult(updateInfo); + final MappedFields map = createMapFromResult(updateInfo); final String eventId = calculateEventId( updateInfo.getTopicPath(), updateInfo.getTarget().getOpenaireId(), updateInfo.getHighlightValueAsString()); @@ -46,35 +45,35 @@ public class EventFactory { return res; } - private static Map createMapFromResult(final UpdateInfo updateInfo) { - final Map map = new HashMap<>(); + private static MappedFields createMapFromResult(final UpdateInfo updateInfo) { + final MappedFields map = new MappedFields(); final OaBrokerMainEntity source = updateInfo.getSource(); final OaBrokerMainEntity target = updateInfo.getTarget(); - map.put("target_datasource_id", target.getCollectedFromId()); - map.put("target_datasource_name", target.getCollectedFromName()); + map.setTargetDatasourceId(target.getCollectedFromId()); + map.setTargetDatasourceName(target.getCollectedFromName()); - map.put("target_publication_id", target.getOpenaireId()); + map.setTargetResultId(target.getOpenaireId()); final List titles = target.getTitles(); if (titles.size() > 0) { - map.put("target_publication_title", titles.get(0)); + map.setTargetResultTitle(titles.get(0)); } final long date = parseDateTolong(target.getPublicationdate()); if (date > 0) { - map.put("target_dateofacceptance", date); + map.setTargetDateofacceptance(date); } - map.put("target_publication_subject_list", target.getSubjects()); - map.put("target_publication_author_list", target.getCreators()); + 
map.setTargetSubjects(target.getSubjects().stream().map(s -> s.getValue()).collect(Collectors.toList())); + map.setTargetAuthors(target.getCreators().stream().map(a -> a.getFullname()).collect(Collectors.toList())); // PROVENANCE INFO - map.put("trust", updateInfo.getTrust()); - map.put("provenance_datasource_id", source.getCollectedFromId()); - map.put("provenance_datasource_name", source.getCollectedFromName()); - map.put("provenance_publication_id_list", source.getOpenaireId()); + map.setTrust(updateInfo.getTrust()); + map.setProvenanceDatasourceId(source.getCollectedFromId()); + map.setProvenanceDatasourceName(source.getCollectedFromName()); + map.setProvenanceResultId(source.getOpenaireId()); return map; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/MappedFields.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/MappedFields.java new file mode 100644 index 000000000..22a878e29 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/MappedFields.java @@ -0,0 +1,137 @@ + +package eu.dnetlib.dhp.broker.model; + +import java.io.Serializable; +import java.util.List; + +import org.codehaus.jackson.annotate.JsonProperty; + +public class MappedFields implements Serializable { + + /** + * + */ + private static final long serialVersionUID = -7999704113195802008L; + + @JsonProperty("target_datasource_id") + private String targetDatasourceId; + + @JsonProperty("target_datasource_name") + private String targetDatasourceName; + + @JsonProperty("target_result_id") + private String targetResultId; + + @JsonProperty("target_result_title") + private String targetResultTitle; + + @JsonProperty("target_dateofacceptance") + private long targetDateofacceptance; + + @JsonProperty("target_result_subject_list") + private List targetSubjects; + + @JsonProperty("target_result_author_list") + private List targetAuthors; + + @JsonProperty("trust") + private float trust; + + 
@JsonProperty("provenance_datasource_id") + private String provenanceDatasourceId; + + @JsonProperty("provenance_datasource_name") + private String provenanceDatasourceName; + + @JsonProperty("setProvenanceResultId") + private String provenanceResultId; + + public String getTargetDatasourceId() { + return targetDatasourceId; + } + + public void setTargetDatasourceId(final String targetDatasourceId) { + this.targetDatasourceId = targetDatasourceId; + } + + public String getTargetDatasourceName() { + return targetDatasourceName; + } + + public void setTargetDatasourceName(final String targetDatasourceName) { + this.targetDatasourceName = targetDatasourceName; + } + + public String getTargetResultId() { + return targetResultId; + } + + public void setTargetResultId(final String targetResultId) { + this.targetResultId = targetResultId; + } + + public String getTargetResultTitle() { + return targetResultTitle; + } + + public void setTargetResultTitle(final String targetResultTitle) { + this.targetResultTitle = targetResultTitle; + } + + public long getTargetDateofacceptance() { + return targetDateofacceptance; + } + + public void setTargetDateofacceptance(final long targetDateofacceptance) { + this.targetDateofacceptance = targetDateofacceptance; + } + + public List getTargetSubjects() { + return targetSubjects; + } + + public void setTargetSubjects(final List targetSubjects) { + this.targetSubjects = targetSubjects; + } + + public List getTargetAuthors() { + return targetAuthors; + } + + public void setTargetAuthors(final List targetAuthors) { + this.targetAuthors = targetAuthors; + } + + public float getTrust() { + return trust; + } + + public void setTrust(final float trust) { + this.trust = trust; + } + + public String getProvenanceDatasourceId() { + return provenanceDatasourceId; + } + + public void setProvenanceDatasourceId(final String provenanceDatasourceId) { + this.provenanceDatasourceId = provenanceDatasourceId; + } + + public String 
getProvenanceDatasourceName() { + return provenanceDatasourceName; + } + + public void setProvenanceDatasourceName(final String provenanceDatasourceName) { + this.provenanceDatasourceName = provenanceDatasourceName; + } + + public String getProvenanceResultId() { + return provenanceResultId; + } + + public void setProvenanceResultId(final String provenanceResultId) { + this.provenanceResultId = provenanceResultId; + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java index 7451e5891..1a3f514e8 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java @@ -7,29 +7,7 @@ import java.util.List; import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.model.EventFactory; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsReferencedBy; -import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsRelatedTo; -import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsSupplementedBy; -import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsSupplementedTo; -import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetReferences; -import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMissingProject; -import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMoreProject; -import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsReferencedBy; -import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsRelatedTo; -import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsSupplementedBy; -import 
eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsSupplementedTo; -import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationReferences; -import eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware.EnrichMissingSoftware; -import eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware.EnrichMoreSoftware; import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAbstract; -import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAuthorOrcid; -import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingOpenAccess; -import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPid; -import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPublicationDate; -import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingSubject; -import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreOpenAccess; -import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMorePid; -import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreSubject; import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup; import eu.dnetlib.pace.config.DedupConfig; @@ -38,31 +16,31 @@ public class EventFinder { private static List> matchers = new ArrayList<>(); static { matchers.add(new EnrichMissingAbstract()); - matchers.add(new EnrichMissingAuthorOrcid()); - matchers.add(new EnrichMissingOpenAccess()); - matchers.add(new EnrichMissingPid()); - matchers.add(new EnrichMissingPublicationDate()); - matchers.add(new EnrichMissingSubject()); - matchers.add(new EnrichMoreOpenAccess()); - matchers.add(new EnrichMorePid()); - matchers.add(new EnrichMoreSubject()); + // matchers.add(new EnrichMissingAuthorOrcid()); + // matchers.add(new EnrichMissingOpenAccess()); + // matchers.add(new EnrichMissingPid()); + // matchers.add(new EnrichMissingPublicationDate()); + // matchers.add(new EnrichMissingSubject()); + // matchers.add(new EnrichMoreOpenAccess()); + // matchers.add(new EnrichMorePid()); + // matchers.add(new EnrichMoreSubject()); - // Advanced 
matchers - matchers.add(new EnrichMissingProject()); - matchers.add(new EnrichMoreProject()); - matchers.add(new EnrichMissingSoftware()); - matchers.add(new EnrichMoreSoftware()); - matchers.add(new EnrichMissingPublicationIsRelatedTo()); - matchers.add(new EnrichMissingPublicationIsReferencedBy()); - matchers.add(new EnrichMissingPublicationReferences()); - matchers.add(new EnrichMissingPublicationIsSupplementedTo()); - matchers.add(new EnrichMissingPublicationIsSupplementedBy()); - matchers.add(new EnrichMissingDatasetIsRelatedTo()); - matchers.add(new EnrichMissingDatasetIsReferencedBy()); - matchers.add(new EnrichMissingDatasetReferences()); - matchers.add(new EnrichMissingDatasetIsSupplementedTo()); - matchers.add(new EnrichMissingDatasetIsSupplementedBy()); - matchers.add(new EnrichMissingAbstract()); + // // Advanced matchers + // matchers.add(new EnrichMissingProject()); + // matchers.add(new EnrichMoreProject()); + // matchers.add(new EnrichMissingSoftware()); + // matchers.add(new EnrichMoreSoftware()); + // matchers.add(new EnrichMissingPublicationIsRelatedTo()); + // matchers.add(new EnrichMissingPublicationIsReferencedBy()); + // matchers.add(new EnrichMissingPublicationReferences()); + // matchers.add(new EnrichMissingPublicationIsSupplementedTo()); + // matchers.add(new EnrichMissingPublicationIsSupplementedBy()); + // matchers.add(new EnrichMissingDatasetIsRelatedTo()); + // matchers.add(new EnrichMissingDatasetIsReferencedBy()); + // matchers.add(new EnrichMissingDatasetReferences()); + // matchers.add(new EnrichMissingDatasetIsSupplementedTo()); + // matchers.add(new EnrichMissingDatasetIsSupplementedBy()); + // matchers.add(new EnrichMissingAbstract()); } public static EventGroup generateEvents(final ResultGroup results, final DedupConfig dedupConfig) { From e8fb305f181b21c96d866ef8075c7660a512e9b3 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Thu, 25 Jun 2020 15:53:20 +0200 Subject: [PATCH 36/37] compilation of event map --- 
.../dhp/broker/model/MappedFields.java | 23 ------------------- 1 file changed, 23 deletions(-) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/MappedFields.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/MappedFields.java index 22a878e29..4b0ed171b 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/MappedFields.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/MappedFields.java @@ -4,8 +4,6 @@ package eu.dnetlib.dhp.broker.model; import java.io.Serializable; import java.util.List; -import org.codehaus.jackson.annotate.JsonProperty; - public class MappedFields implements Serializable { /** @@ -13,37 +11,16 @@ public class MappedFields implements Serializable { */ private static final long serialVersionUID = -7999704113195802008L; - @JsonProperty("target_datasource_id") private String targetDatasourceId; - - @JsonProperty("target_datasource_name") private String targetDatasourceName; - - @JsonProperty("target_result_id") private String targetResultId; - - @JsonProperty("target_result_title") private String targetResultTitle; - - @JsonProperty("target_dateofacceptance") private long targetDateofacceptance; - - @JsonProperty("target_result_subject_list") private List targetSubjects; - - @JsonProperty("target_result_author_list") private List targetAuthors; - - @JsonProperty("trust") private float trust; - - @JsonProperty("provenance_datasource_id") private String provenanceDatasourceId; - - @JsonProperty("provenance_datasource_name") private String provenanceDatasourceName; - - @JsonProperty("setProvenanceResultId") private String provenanceResultId; public String getTargetDatasourceId() { From 2393d9da2f376890cd9fa62936aaef97b6580c8e Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Fri, 26 Jun 2020 11:20:45 +0200 Subject: [PATCH 37/37] limits --- .../dhp/broker/oa/matchers/UpdateMatcher.java | 33 ++++++++++--------- 
.../AbstractEnrichMissingDataset.java | 2 +- .../relatedProjects/EnrichMissingProject.java | 2 +- .../relatedProjects/EnrichMoreProject.java | 2 +- .../AbstractEnrichMissingPublication.java | 2 +- .../EnrichMissingSoftware.java | 2 +- .../relatedSoftware/EnrichMoreSoftware.java | 2 +- .../simple/EnrichMissingAbstract.java | 2 +- .../simple/EnrichMissingAuthorOrcid.java | 2 +- .../simple/EnrichMissingOpenAccess.java | 2 +- .../oa/matchers/simple/EnrichMissingPid.java | 2 +- .../simple/EnrichMissingPublicationDate.java | 2 +- .../matchers/simple/EnrichMissingSubject.java | 2 +- .../matchers/simple/EnrichMoreOpenAccess.java | 2 +- .../oa/matchers/simple/EnrichMorePid.java | 2 +- .../oa/matchers/simple/EnrichMoreSubject.java | 2 +- .../dhp/broker/oa/util/EventFinder.java | 27 ++++++++++----- 17 files changed, 51 insertions(+), 39 deletions(-) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java index c0287bda0..7f82f9a2b 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java @@ -1,13 +1,14 @@ package eu.dnetlib.dhp.broker.oa.matchers; -import java.util.Arrays; +import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.function.BiConsumer; import java.util.function.Function; +import java.util.stream.Collectors; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; @@ -19,15 +20,15 @@ import eu.dnetlib.pace.config.DedupConfig; public abstract class UpdateMatcher { - private final boolean multipleUpdate; + private final int maxNumber; private final Function topicFunction; private final BiConsumer compileHighlightFunction; 
private final Function highlightToStringFunction; - public UpdateMatcher(final boolean multipleUpdate, final Function topicFunction, + public UpdateMatcher(final int maxNumber, final Function topicFunction, final BiConsumer compileHighlightFunction, final Function highlightToStringFunction) { - this.multipleUpdate = multipleUpdate; + this.maxNumber = maxNumber; this.topicFunction = topicFunction; this.compileHighlightFunction = compileHighlightFunction; this.highlightToStringFunction = highlightToStringFunction; @@ -57,17 +58,19 @@ public abstract class UpdateMatcher { } } - final Collection> values = infoMap.values(); + final List> values = infoMap + .values() + .stream() + .sorted((o1, o2) -> Float.compare(o2.getTrust(), o1.getTrust())) // DESCENDING + .collect(Collectors.toList()); - if (values.isEmpty() || multipleUpdate) { - return values; + if (values.isEmpty()) { + return new ArrayList<>(); + } else if (values.size() > maxNumber) { + System.err.println("Too many events (" + values.size() + ") matched by " + getClass().getSimpleName()); + return values.subList(0, maxNumber); } else { - final UpdateInfo v = values - .stream() - .sorted((o1, o2) -> Float.compare(o1.getTrust(), o2.getTrust())) - .findFirst() - .get(); - return Arrays.asList(v); + return values; } } @@ -81,8 +84,8 @@ public abstract class UpdateMatcher { return StringUtils.isBlank(field); } - public boolean isMultipleUpdate() { - return multipleUpdate; + public int getMaxNumber() { + return maxNumber; } public Function getTopicFunction() { diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java index c8b93596a..f21c1c7b3 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java +++ 
b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java @@ -13,7 +13,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; public abstract class AbstractEnrichMissingDataset extends UpdateMatcher { public AbstractEnrichMissingDataset(final Topic topic) { - super(true, + super(10, rel -> topic, (p, rel) -> p.getDatasets().add(rel), rel -> rel.getOpenaireId()); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java index 49c546bba..4b563d381 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java @@ -12,7 +12,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; public class EnrichMissingProject extends UpdateMatcher { public EnrichMissingProject() { - super(true, + super(20, prj -> Topic.ENRICH_MISSING_PROJECT, (p, prj) -> p.getProjects().add(prj), prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode()); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java index 6954a3fb5..85b2cbe28 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java @@ -13,7 +13,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; public class EnrichMoreProject extends UpdateMatcher { 
public EnrichMoreProject() { - super(true, + super(20, prj -> Topic.ENRICH_MORE_PROJECT, (p, prj) -> p.getProjects().add(prj), prj -> projectAsString(prj)); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java index cc4f68f87..f951131b1 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java @@ -13,7 +13,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; public abstract class AbstractEnrichMissingPublication extends UpdateMatcher { public AbstractEnrichMissingPublication(final Topic topic) { - super(true, + super(10, rel -> topic, (p, rel) -> p.getPublications().add(rel), rel -> rel.getOpenaireId()); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java index d01f0c370..a638024bc 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java @@ -13,7 +13,7 @@ public class EnrichMissingSoftware extends UpdateMatcher { public EnrichMissingSoftware() { - super(true, + super(10, s -> Topic.ENRICH_MISSING_SOFTWARE, (p, s) -> p.getSoftwares().add(s), s -> s.getOpenaireId()); diff --git 
a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java index a612b6074..2bc370187 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java @@ -13,7 +13,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; public class EnrichMoreSoftware extends UpdateMatcher { public EnrichMoreSoftware() { - super(true, + super(10, s -> Topic.ENRICH_MORE_SOFTWARE, (p, s) -> p.getSoftwares().add(s), s -> s.getOpenaireId()); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java index 73462bae8..b61696e45 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java @@ -12,7 +12,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; public class EnrichMissingAbstract extends UpdateMatcher { public EnrichMissingAbstract() { - super(false, + super(1, s -> Topic.ENRICH_MISSING_ABSTRACT, (p, s) -> p.getAbstracts().add(s), s -> s); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java index 2a01188a9..7bbc43fe3 100644 --- 
a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java @@ -15,7 +15,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; public class EnrichMissingAuthorOrcid extends UpdateMatcher { public EnrichMissingAuthorOrcid() { - super(true, + super(40, aut -> Topic.ENRICH_MISSING_AUTHOR_ORCID, (p, aut) -> p.getCreators().add(aut), aut -> aut.getOrcid()); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java index 487382957..41a00dcd1 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java @@ -14,7 +14,7 @@ import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; public class EnrichMissingOpenAccess extends UpdateMatcher { public EnrichMissingOpenAccess() { - super(true, + super(20, i -> Topic.ENRICH_MISSING_OA_VERSION, (p, i) -> p.getInstances().add(i), OaBrokerInstance::getUrl); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java index ee1617b1e..4863bdeb7 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java @@ -13,7 +13,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; public class EnrichMissingPid extends UpdateMatcher { public 
EnrichMissingPid() { - super(true, + super(10, pid -> Topic.ENRICH_MISSING_PID, (p, pid) -> p.getPids().add(pid), pid -> pid.getType() + "::" + pid.getValue()); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java index 2c0533fa3..e7b65dad8 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java @@ -12,7 +12,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; public class EnrichMissingPublicationDate extends UpdateMatcher { public EnrichMissingPublicationDate() { - super(false, + super(1, date -> Topic.ENRICH_MISSING_PUBLICATION_DATE, (p, date) -> p.setPublicationdate(date), s -> s); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java index 9ab9fce48..f762e3f52 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java @@ -13,7 +13,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; public class EnrichMissingSubject extends UpdateMatcher { public EnrichMissingSubject() { - super(true, + super(20, s -> Topic.fromPath("ENRICH/MISSING/SUBJECT/" + s.getType()), (p, s) -> p.getSubjects().add(s), s -> subjectAsString(s)); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java 
b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java index e90a8f201..9ce362a97 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java @@ -14,7 +14,7 @@ import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; public class EnrichMoreOpenAccess extends UpdateMatcher { public EnrichMoreOpenAccess() { - super(true, + super(20, i -> Topic.ENRICH_MORE_OA_VERSION, (p, i) -> p.getInstances().add(i), OaBrokerInstance::getUrl); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java index 43b4f0628..583960037 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java @@ -13,7 +13,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; public class EnrichMorePid extends UpdateMatcher { public EnrichMorePid() { - super(true, + super(20, pid -> Topic.ENRICH_MORE_PID, (p, pid) -> p.getPids().add(pid), pid -> pidAsString(pid)); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java index 97b289b69..150029462 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java @@ -13,7 +13,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; 
public class EnrichMoreSubject extends UpdateMatcher { public EnrichMoreSubject() { - super(true, + super(20, s -> Topic.fromPath("ENRICH/MORE/SUBJECT/" + s.getType()), (p, s) -> p.getSubjects().add(s), s -> subjectAsString(s)); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java index 1a3f514e8..e142b5904 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java @@ -7,7 +7,16 @@ import java.util.List; import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.model.EventFactory; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; +import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMissingProject; import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAbstract; +import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAuthorOrcid; +import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingOpenAccess; +import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPid; +import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPublicationDate; +import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingSubject; +import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreOpenAccess; +import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMorePid; +import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreSubject; import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup; import eu.dnetlib.pace.config.DedupConfig; @@ -16,17 +25,17 @@ public class EventFinder { private static List> matchers = new ArrayList<>(); static { matchers.add(new EnrichMissingAbstract()); - // matchers.add(new EnrichMissingAuthorOrcid()); - // matchers.add(new EnrichMissingOpenAccess()); - // matchers.add(new EnrichMissingPid()); - // 
matchers.add(new EnrichMissingPublicationDate()); - // matchers.add(new EnrichMissingSubject()); - // matchers.add(new EnrichMoreOpenAccess()); - // matchers.add(new EnrichMorePid()); - // matchers.add(new EnrichMoreSubject()); + matchers.add(new EnrichMissingAuthorOrcid()); + matchers.add(new EnrichMissingOpenAccess()); + matchers.add(new EnrichMissingPid()); + matchers.add(new EnrichMissingPublicationDate()); + matchers.add(new EnrichMissingSubject()); + matchers.add(new EnrichMoreOpenAccess()); + matchers.add(new EnrichMorePid()); + matchers.add(new EnrichMoreSubject()); // // Advanced matchers - // matchers.add(new EnrichMissingProject()); + matchers.add(new EnrichMissingProject()); // matchers.add(new EnrichMoreProject()); // matchers.add(new EnrichMissingSoftware()); // matchers.add(new EnrichMoreSoftware());