diff --git a/dhp-workflows/dhp-broker-events/pom.xml b/dhp-workflows/dhp-broker-events/pom.xml index 424015a3c..119031b06 100644 --- a/dhp-workflows/dhp-broker-events/pom.xml +++ b/dhp-workflows/dhp-broker-events/pom.xml @@ -1,5 +1,7 @@ - + dhp-workflows eu.dnetlib.dhp @@ -24,7 +26,11 @@ org.apache.spark spark-sql_2.11 - + + org.elasticsearch + elasticsearch-hadoop + + eu.dnetlib.dhp @@ -53,7 +59,7 @@ eu.dnetlib dnet-openaire-broker-common - [3.0.3,4.0.0) + [3.0.4,4.0.0) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Event.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Event.java index f94d286e4..18950d98e 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Event.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Event.java @@ -2,7 +2,6 @@ package eu.dnetlib.dhp.broker.model; import java.io.Serializable; -import java.util.Map; public class Event implements Serializable { @@ -25,7 +24,7 @@ public class Event implements Serializable { private boolean instantMessage; - private Map map; + private MappedFields map; public Event() { } @@ -33,7 +32,7 @@ public class Event implements Serializable { public Event(final String producerId, final String eventId, final String topic, final String payload, final Long creationDate, final Long expiryDate, final boolean instantMessage, - final Map map) { + final MappedFields map) { this.producerId = producerId; this.eventId = eventId; this.topic = topic; @@ -100,11 +99,11 @@ public class Event implements Serializable { this.instantMessage = instantMessage; } - public Map getMap() { + public MappedFields getMap() { return this.map; } - public void setMap(final Map map) { + public void setMap(final MappedFields map) { this.map = map; } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java index 6e38f7448..49e750698 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java @@ -3,9 +3,8 @@ package eu.dnetlib.dhp.broker.model; import java.text.ParseException; import java.util.Date; -import java.util.HashMap; import java.util.List; -import java.util.Map; +import java.util.stream.Collectors; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; @@ -30,10 +29,10 @@ public class EventFactory { final Event res = new Event(); - final Map map = createMapFromResult(updateInfo); + final MappedFields map = createMapFromResult(updateInfo); final String eventId = calculateEventId( - updateInfo.getTopicPath(), updateInfo.getTarget().getOriginalId(), updateInfo.getHighlightValueAsString()); + updateInfo.getTopicPath(), updateInfo.getTarget().getOpenaireId(), updateInfo.getHighlightValueAsString()); res.setEventId(eventId); res.setProducerId(PRODUCER_ID); @@ -43,38 +42,41 @@ public class EventFactory { res.setCreationDate(now); res.setExpiryDate(calculateExpiryDate(now)); res.setInstantMessage(false); + return res; } - private static Map createMapFromResult(final UpdateInfo updateInfo) { - final Map map = new HashMap<>(); + private static MappedFields createMapFromResult(final UpdateInfo updateInfo) { + final MappedFields map = new MappedFields(); final OaBrokerMainEntity source = updateInfo.getSource(); final OaBrokerMainEntity target = updateInfo.getTarget(); - map.put("target_datasource_id", target.getCollectedFromId()); - map.put("target_datasource_name", target.getCollectedFromName()); + map.setTargetDatasourceId(target.getCollectedFromId()); + map.setTargetDatasourceName(target.getCollectedFromName()); + map.setTargetDatasourceType(target.getCollectedFromType()); - map.put("target_publication_id", target.getOriginalId()); + map.setTargetResultId(target.getOpenaireId()); final List titles = target.getTitles(); if (titles.size() > 0) { - map.put("target_publication_title", titles.get(0)); + map.setTargetResultTitle(titles.get(0)); } final long date = parseDateTolong(target.getPublicationdate()); if (date > 0) { - map.put("target_dateofacceptance", date); + map.setTargetDateofacceptance(date); } - map.put("target_publication_subject_list", target.getSubjects()); - map.put("target_publication_author_list", target.getCreators()); + map.setTargetSubjects(target.getSubjects().stream().map(s -> s.getValue()).collect(Collectors.toList())); + map.setTargetAuthors(target.getCreators().stream().map(a -> a.getFullname()).collect(Collectors.toList())); // PROVENANCE INFO - map.put("trust", updateInfo.getTrust()); - map.put("provenance_datasource_id", source.getCollectedFromId()); - map.put("provenance_datasource_name", source.getCollectedFromName()); - map.put("provenance_publication_id_list", source.getOriginalId()); + map.setTrust(updateInfo.getTrust()); + map.setProvenanceDatasourceId(source.getCollectedFromId()); + map.setProvenanceDatasourceName(source.getCollectedFromName()); + map.setProvenanceDatasourceType(source.getCollectedFromType()); + map.setProvenanceResultId(source.getOpenaireId()); return map; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/MappedFields.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/MappedFields.java new file mode 100644 index 000000000..2c1be3ba4 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/MappedFields.java @@ -0,0 +1,136 @@ + +package eu.dnetlib.dhp.broker.model; + +import java.io.Serializable; +import java.util.List; + +public class MappedFields implements Serializable { + + /** + * + */ + private static final long serialVersionUID = -7999704113195802008L; + + private String targetDatasourceId; + private String targetDatasourceName; + private String targetDatasourceType; + private String targetResultId; + private String targetResultTitle; + private long targetDateofacceptance; + private List targetSubjects; + private List targetAuthors; + private float trust; + private String provenanceDatasourceId; + private String provenanceDatasourceName; + private String provenanceDatasourceType; + private String provenanceResultId; + + public String getTargetDatasourceId() { + return targetDatasourceId; + } + + public void setTargetDatasourceId(final String targetDatasourceId) { + this.targetDatasourceId = targetDatasourceId; + } + + public String getTargetDatasourceName() { + return targetDatasourceName; + } + + public void setTargetDatasourceName(final String targetDatasourceName) { + this.targetDatasourceName = targetDatasourceName; + } + + public String getTargetDatasourceType() { + return targetDatasourceType; + } + + public void setTargetDatasourceType(final String targetDatasourceType) { + this.targetDatasourceType = targetDatasourceType; + } + + public String getTargetResultId() { + return targetResultId; + } + + public void setTargetResultId(final String targetResultId) { + this.targetResultId = targetResultId; + } + + public String getTargetResultTitle() { + return targetResultTitle; + } + + public void setTargetResultTitle(final String targetResultTitle) { + this.targetResultTitle = targetResultTitle; + } + + public long getTargetDateofacceptance() { + return targetDateofacceptance; + } + + public void setTargetDateofacceptance(final long targetDateofacceptance) { + this.targetDateofacceptance = targetDateofacceptance; + } + + public List getTargetSubjects() { + return targetSubjects; + } + + public void setTargetSubjects(final List targetSubjects) { + this.targetSubjects = targetSubjects; + } + + public List getTargetAuthors() { + return targetAuthors; + } + + public void setTargetAuthors(final List targetAuthors) { + this.targetAuthors = targetAuthors; + } + + public float getTrust() { + return trust; + } + + public void setTrust(final float trust) { + this.trust = trust; + } + + public String getProvenanceDatasourceId() { + return provenanceDatasourceId; + } + + public void setProvenanceDatasourceId(final String provenanceDatasourceId) { + this.provenanceDatasourceId = provenanceDatasourceId; + } + + public String getProvenanceDatasourceName() { + return provenanceDatasourceName; + } + + public void setProvenanceDatasourceName(final String provenanceDatasourceName) { + this.provenanceDatasourceName = provenanceDatasourceName; + } + + public String getProvenanceDatasourceType() { + return provenanceDatasourceType; + } + + public void setProvenanceDatasourceType(final String provenanceDatasourceType) { + this.provenanceDatasourceType = provenanceDatasourceType; + } + + public String getProvenanceResultId() { + return provenanceResultId; + } + + public void setProvenanceResultId(final String provenanceResultId) { + this.provenanceResultId = provenanceResultId; + } + + public static long getSerialversionuid() { + return serialVersionUID; + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java index 3ea0086ff..5d3121aed 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java @@ -3,14 +3,20 @@ package eu.dnetlib.dhp.broker.oa; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Map; import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.SparkContext; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; +import org.apache.spark.util.LongAccumulator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -18,6 +24,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.broker.model.Event; +import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; import eu.dnetlib.dhp.broker.oa.util.EventFinder; import eu.dnetlib.dhp.broker.oa.util.EventGroup; @@ -44,9 +51,6 @@ public class GenerateEventsJob { .orElse(Boolean.TRUE); log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String graphPath = parser.get("graphPath"); - log.info("graphPath: {}", graphPath); - final String workingPath = parser.get("workingPath"); log.info("workingPath: {}", workingPath); @@ -56,9 +60,18 @@ public class GenerateEventsJob { final String dedupConfigProfileId = parser.get("dedupConfProfile"); log.info("dedupConfigProfileId: {}", dedupConfigProfileId); - final String eventsPath = workingPath + "/eventsPath"; + final String eventsPath = workingPath + "/events"; log.info("eventsPath: {}", eventsPath); + final Set dsIdWhitelist = parseParamAsList(parser, "datasourceIdWhitelist"); + log.info("datasourceIdWhitelist: {}", StringUtils.join(dsIdWhitelist, ",")); + + final Set dsTypeWhitelist = parseParamAsList(parser, "datasourceTypeWhitelist"); + log.info("datasourceTypeWhitelist: {}", StringUtils.join(dsTypeWhitelist, ",")); + + final Set dsIdBlacklist = parseParamAsList(parser, "datasourceIdBlacklist"); + log.info("datasourceIdBlacklist: {}", StringUtils.join(dsIdBlacklist, ",")); + final SparkConf conf = new SparkConf(); // TODO UNCOMMENT @@ -69,21 +82,54 @@ public class GenerateEventsJob { ClusterUtils.removeDir(spark, eventsPath); + final Map accumulators = prepareAccumulators(spark.sparkContext()); + + final LongAccumulator total = spark.sparkContext().longAccumulator("total_events"); + final Dataset groups = ClusterUtils - .readPath(spark, graphPath + "/relation", ResultGroup.class); + .readPath(spark, workingPath + "/duplicates", ResultGroup.class); - final Dataset events = groups + final Dataset dataset = groups .map( - (MapFunction) g -> EventFinder.generateEvents(g, dedupConfig), - Encoders.bean(EventGroup.class)) - .flatMap(group -> group.getData().iterator(), Encoders.bean(Event.class)); + g -> EventFinder + .generateEvents(g, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist, dedupConfig, accumulators), + Encoders + .bean(EventGroup.class)) + .flatMap(g -> g.getData().iterator(), Encoders.bean(Event.class)); - events.write().mode(SaveMode.Overwrite).json(eventsPath); + ClusterUtils.save(dataset, eventsPath, Event.class, total); }); } + private static Set parseParamAsList(final ArgumentApplicationParser parser, final String key) { + final String s = parser.get(key).trim(); + + final Set res = new HashSet<>(); + + if (s.length() > 1) { // A value of a single char (for example: '-') indicates an empty list + Arrays + .stream(s.split(",")) + .map(String::trim) + .filter(StringUtils::isNotBlank) + .forEach(res::add); + } + + return res; + } + + public static Map prepareAccumulators(final SparkContext sc) { + + return EventFinder + .getMatchers() + .stream() + .map(UpdateMatcher::accumulatorName) + .distinct() + .collect(Collectors.toMap(s -> s, s -> sc.longAccumulator(s))); + + } + private static DedupConfig loadDedupConfig(final String isLookupUrl, final String profId) throws Exception { final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexOnESJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexOnESJob.java new file mode 100644 index 000000000..9124d18e3 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexOnESJob.java @@ -0,0 +1,71 @@ + +package eu.dnetlib.dhp.broker.oa; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; +import org.elasticsearch.spark.rdd.api.java.JavaEsSpark; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.model.Event; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; + +public class IndexOnESJob { + + private static final Logger log = LoggerFactory.getLogger(IndexOnESJob.class); + + public static void main(final String[] args) throws Exception { + + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + IndexOnESJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/index_es.json"))); + parser.parseArgument(args); + + final SparkConf conf = new SparkConf(); + + final String eventsPath = parser.get("workingPath") + "/events"; + log.info("eventsPath: {}", eventsPath); + + final String index = parser.get("index"); + log.info("index: {}", index); + + final String indexHost = parser.get("esHost"); + log.info("indexHost: {}", indexHost); + + final SparkSession spark = SparkSession.builder().config(conf).getOrCreate(); + + final JavaRDD inputRdd = ClusterUtils + .readPath(spark, eventsPath, Event.class) + .limit(10000) // TODO REMOVE + .map(IndexOnESJob::eventAsJsonString, Encoders.STRING()) + .javaRDD(); + + final Map esCfg = new HashMap<>(); + // esCfg.put("es.nodes", "10.19.65.51, 10.19.65.52, 10.19.65.53, 10.19.65.54"); + esCfg.put("es.nodes", indexHost); + esCfg.put("es.mapping.id", "eventId"); // THE PRIMARY KEY + esCfg.put("es.batch.write.retry.count", "8"); + esCfg.put("es.batch.write.retry.wait", "60s"); + esCfg.put("es.batch.size.entries", "200"); + esCfg.put("es.nodes.wan.only", "true"); + + JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg); + } + + private static String eventAsJsonString(final Event f) throws JsonProcessingException { + return new ObjectMapper().writeValueAsString(f); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinEntitiesJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinEntitiesJob.java deleted file mode 100644 index dac308f36..000000000 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinEntitiesJob.java +++ /dev/null @@ -1,94 +0,0 @@ - -package eu.dnetlib.dhp.broker.oa; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - -import java.util.Optional; - -import org.apache.commons.io.IOUtils; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.TypedColumn; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import eu.dnetlib.broker.objects.OaBrokerMainEntity; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; -import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.OaBrokerMainEntityAggregator; -import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDataset; -import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject; -import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedPublication; -import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedSoftware; -import scala.Tuple2; - -public class JoinEntitiesJob { - - private static final Logger log = LoggerFactory.getLogger(JoinEntitiesJob.class); - - public static void main(final String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - JoinEntitiesJob.class - .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); - parser.parseArgument(args); - - final Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - - final String graphPath = parser.get("graphPath"); - log.info("graphPath: {}", graphPath); - - final String workingPath = parser.get("workingPath"); - log.info("workingPath: {}", workingPath); - - final String joinedEntitiesPath = workingPath + "/joinedEntities"; - log.info("joinedEntitiesPath: {}", joinedEntitiesPath); - - final SparkConf conf = new SparkConf(); - - runWithSparkSession(conf, isSparkSessionManaged, spark -> { - - ClusterUtils.removeDir(spark, joinedEntitiesPath); - - final Dataset r0 = ClusterUtils - .readPath(spark, graphPath + "/simpleEntities", OaBrokerMainEntity.class); - - final Dataset r1 = join( - r0, ClusterUtils.readPath(spark, graphPath + "/relatedProjects", RelatedProject.class)); - final Dataset r2 = join( - r1, ClusterUtils.readPath(spark, graphPath + "/relatedDatasets", RelatedDataset.class)); - final Dataset r3 = join( - r2, ClusterUtils.readPath(spark, graphPath + "/relatedPublications", RelatedPublication.class)); - final Dataset r4 = join( - r3, ClusterUtils.readPath(spark, graphPath + "/relatedSoftwares", RelatedSoftware.class)); - - r4.write().mode(SaveMode.Overwrite).json(joinedEntitiesPath); - - }); - - } - - private static Dataset join(final Dataset sources, - final Dataset typedRels) { - - final TypedColumn, OaBrokerMainEntity> aggr = new OaBrokerMainEntityAggregator() - .toColumn(); - - return sources - .joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer") - .groupByKey( - (MapFunction, String>) t -> t._1.getOpenaireId(), Encoders.STRING()) - .agg(aggr) - .map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class)); - - } - -} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep0Job.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep0Job.java new file mode 100644 index 000000000..eb1825fa5 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep0Job.java @@ -0,0 +1,80 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.TypedColumn; +import org.apache.spark.util.LongAccumulator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.AddDatasourceTypeAggregator; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.SimpleDatasourceInfo; +import scala.Tuple2; + +public class JoinStep0Job { + + private static final Logger log = LoggerFactory.getLogger(JoinStep0Job.class); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + JoinStep0Job.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String graphPath = parser.get("graphPath"); + log.info("graphPath: {}", graphPath); + + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); + + final String outputPath = workingPath + "/joinedEntities_step0"; + log.info("outputPath: {}", outputPath); + + final SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, outputPath); + + final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities"); + + final Dataset sources = ClusterUtils + .readPath(spark, workingPath + "/simpleEntities", OaBrokerMainEntity.class); + + final Dataset datasources = ClusterUtils + .readPath(spark, workingPath + "/datasources", SimpleDatasourceInfo.class); + + final TypedColumn, OaBrokerMainEntity> aggr = new AddDatasourceTypeAggregator() + .toColumn(); + + final Dataset dataset = sources + .joinWith(datasources, sources.col("collectedFromId").equalTo(datasources.col("id")), "inner") + .groupByKey(t -> t._1.getOpenaireId(), Encoders.STRING()) + .agg(aggr) + .map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class)); + + ClusterUtils.save(dataset, outputPath, OaBrokerMainEntity.class, total); + + }); + + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep1Job.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep1Job.java new file mode 100644 index 000000000..8e502f736 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep1Job.java @@ -0,0 +1,80 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.TypedColumn; +import org.apache.spark.util.LongAccumulator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProjectAggregator; +import scala.Tuple2; + +public class JoinStep1Job { + + private static final Logger log = LoggerFactory.getLogger(JoinStep1Job.class); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + JoinStep1Job.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); + + final String joinedEntitiesPath = workingPath + "/joinedEntities_step1"; + log.info("joinedEntitiesPath: {}", joinedEntitiesPath); + + final SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, joinedEntitiesPath); + + final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities"); + + final Dataset sources = ClusterUtils + .readPath(spark, workingPath + "/joinedEntities_step0", OaBrokerMainEntity.class); + + final Dataset typedRels = ClusterUtils + .readPath(spark, workingPath + "/relatedProjects", RelatedProject.class); + + final TypedColumn, OaBrokerMainEntity> aggr = new RelatedProjectAggregator() + .toColumn(); + + final Dataset dataset = sources + .joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer") + .groupByKey( + (MapFunction, String>) t -> t._1.getOpenaireId(), + Encoders.STRING()) + .agg(aggr) + .map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class)); + + ClusterUtils.save(dataset, joinedEntitiesPath, OaBrokerMainEntity.class, total); + + }); + + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep2Job.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep2Job.java new file mode 100644 index 000000000..cdcf0add4 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep2Job.java @@ -0,0 +1,80 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.TypedColumn; +import org.apache.spark.util.LongAccumulator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedSoftware; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedSoftwareAggregator; +import scala.Tuple2; + +public class JoinStep2Job { + + private static final Logger log = LoggerFactory.getLogger(JoinStep2Job.class); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + JoinStep2Job.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); + + final String joinedEntitiesPath = workingPath + "/joinedEntities_step2"; + log.info("joinedEntitiesPath: {}", joinedEntitiesPath); + + final SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, joinedEntitiesPath); + + final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities"); + + final Dataset sources = ClusterUtils + .readPath(spark, workingPath + "/joinedEntities_step1", OaBrokerMainEntity.class); + + final Dataset typedRels = ClusterUtils + .readPath(spark, workingPath + "/relatedSoftwares", RelatedSoftware.class); + + final TypedColumn, OaBrokerMainEntity> aggr = new RelatedSoftwareAggregator() + .toColumn(); + + final Dataset dataset = sources + .joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer") + .groupByKey( + (MapFunction, String>) t -> t._1.getOpenaireId(), + Encoders.STRING()) + .agg(aggr) + .map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class)); + + ClusterUtils.save(dataset, joinedEntitiesPath, OaBrokerMainEntity.class, total); + + }); + + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep3Job.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep3Job.java new file mode 100644 index 000000000..4d06f6f13 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep3Job.java @@ -0,0 +1,80 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.TypedColumn; +import org.apache.spark.util.LongAccumulator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDataset; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDatasetAggregator; +import scala.Tuple2; + +public class JoinStep3Job { + + private static final Logger log = LoggerFactory.getLogger(JoinStep3Job.class); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + JoinStep3Job.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); + + final String joinedEntitiesPath = workingPath + "/joinedEntities_step3"; + log.info("joinedEntitiesPath: {}", joinedEntitiesPath); + + final SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, joinedEntitiesPath); + + final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities"); + + final Dataset sources = ClusterUtils + .readPath(spark, workingPath + "/joinedEntities_step2", OaBrokerMainEntity.class); + + final Dataset typedRels = ClusterUtils + .readPath(spark, workingPath + "/relatedDatasets", RelatedDataset.class); + + final TypedColumn, OaBrokerMainEntity> aggr = new RelatedDatasetAggregator() + .toColumn(); + + final Dataset dataset = sources + .joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer") + .groupByKey( + (MapFunction, String>) t -> t._1.getOpenaireId(), + Encoders.STRING()) + .agg(aggr) + .map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class)); + + ClusterUtils.save(dataset, joinedEntitiesPath, OaBrokerMainEntity.class, total); + + }); + + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep4Job.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep4Job.java new file mode 100644 index 000000000..b53d7e39b --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep4Job.java @@ -0,0 +1,80 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.TypedColumn; +import org.apache.spark.util.LongAccumulator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedPublication; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedPublicationAggregator; +import scala.Tuple2; + +public class JoinStep4Job { + + private static final Logger log = LoggerFactory.getLogger(JoinStep4Job.class); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + JoinStep4Job.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); + + final String joinedEntitiesPath = workingPath + "/joinedEntities_step4"; + log.info("joinedEntitiesPath: {}", joinedEntitiesPath); + + final SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, joinedEntitiesPath); + + final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities"); + + final Dataset sources = ClusterUtils + .readPath(spark, workingPath + "/joinedEntities_step3", OaBrokerMainEntity.class); + + final Dataset typedRels = ClusterUtils + .readPath(spark, workingPath + "/relatedPublications", RelatedPublication.class); + + final TypedColumn, OaBrokerMainEntity> aggr = new RelatedPublicationAggregator() + .toColumn(); + + final Dataset dataset = sources + .joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer") + .groupByKey( + (MapFunction, String>) t -> t._1.getOpenaireId(), + Encoders.STRING()) + .agg(aggr) + .map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class)); + + ClusterUtils.save(dataset, joinedEntitiesPath, OaBrokerMainEntity.class, total); + + }); + + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java index aa057eee8..eb9add00d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java @@ -10,8 +10,8 @@ import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.TypedColumn; +import org.apache.spark.util.LongAccumulator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -32,7 +32,7 @@ public class PrepareGroupsJob { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils .toString( - JoinEntitiesJob.class + PrepareGroupsJob.class .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); parser.parseArgument(args); @@ -48,7 +48,7 @@ public class PrepareGroupsJob { final String workingPath = parser.get("workingPath"); log.info("workingPath: {}", workingPath); - final String groupsPath = workingPath + "/groups"; + final String groupsPath = workingPath + "/duplicates"; log.info("groupsPath: {}", groupsPath); final SparkConf conf = new SparkConf(); @@ -57,8 +57,10 @@ public class PrepareGroupsJob { ClusterUtils.removeDir(spark, groupsPath); + final LongAccumulator total = spark.sparkContext().longAccumulator("total_groups"); + final Dataset results = ClusterUtils - .readPath(spark, graphPath + "/joinedEntities", OaBrokerMainEntity.class); + .readPath(spark, workingPath + "/joinedEntities_step4", OaBrokerMainEntity.class); final Dataset mergedRels = ClusterUtils .readPath(spark, graphPath + "/relation", Relation.class) @@ -67,20 +69,16 @@ public class PrepareGroupsJob { final TypedColumn, ResultGroup> aggr = new ResultAggregator() .toColumn(); - final Dataset groups = results + final Dataset dataset = results .joinWith(mergedRels, results.col("openaireId").equalTo(mergedRels.col("source")), "inner") .groupByKey( (MapFunction, String>) t -> t._2.getTarget(), Encoders.STRING()) .agg(aggr) - .map( - (MapFunction, ResultGroup>) t -> t._2, Encoders.bean(ResultGroup.class)) + .map(t -> t._2, Encoders.bean(ResultGroup.class)) .filter(rg -> rg.getData().size() > 1); - groups - .write() - .mode(SaveMode.Overwrite) - .json(groupsPath); + ClusterUtils.save(dataset, groupsPath, ResultGroup.class, total); }); } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java index edf9b9a7e..0cfc1adcb 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java @@ -9,14 +9,16 @@ import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; +import org.apache.spark.util.LongAccumulator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import eu.dnetlib.broker.objects.OaBrokerRelatedDataset; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDataset; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Relation; public class PrepareRelatedDatasetsJob { @@ -52,22 +54,30 @@ public class PrepareRelatedDatasetsJob { ClusterUtils.removeDir(spark, relsPath); - final Dataset datasets = ClusterUtils - .readPath(spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class); + final LongAccumulator total = spark.sparkContext().longAccumulator("total_rels"); - final Dataset rels = ClusterUtils.readPath(spark, graphPath + "/relation", Relation.class); + final Dataset datasets = ClusterUtils + .readPath(spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class) + .filter(d -> !ClusterUtils.isDedupRoot(d.getId())) + .map(ConversionUtils::oafDatasetToBrokerDataset, Encoders.bean(OaBrokerRelatedDataset.class)); - rels - .joinWith(datasets, datasets.col("id").equalTo(rels.col("target")), "inner") - .map( - t -> new RelatedDataset( - t._1.getSource(), - t._1.getRelType(), - ConversionUtils.oafDatasetToBrokerDataset(t._2)), - Encoders.bean(RelatedDataset.class)) - .write() - .mode(SaveMode.Overwrite) - .json(relsPath); + final Dataset rels = ClusterUtils + .readPath(spark, graphPath + "/relation", Relation.class) + .filter(r -> r.getDataInfo().getDeletedbyinference()) + .filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT)) + .filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass())) + .filter(r -> !ClusterUtils.isDedupRoot(r.getSource())) + .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget())); + + final Dataset dataset = rels + .joinWith(datasets, datasets.col("openaireId").equalTo(rels.col("target")), "inner") + .map(t -> { + final RelatedDataset rel = new RelatedDataset(t._1.getSource(), t._2); + rel.getRelDataset().setRelType(t._1.getRelClass()); + return rel; + }, Encoders.bean(RelatedDataset.class)); + + ClusterUtils.save(dataset, relsPath, RelatedDataset.class, total); }); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasourcesJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasourcesJob.java new file mode 100644 index 000000000..30f5ddac3 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasourcesJob.java @@ -0,0 +1,68 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.util.LongAccumulator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.SimpleDatasourceInfo; +import eu.dnetlib.dhp.schema.oaf.Datasource; + +public class PrepareRelatedDatasourcesJob { + + private static final Logger log = LoggerFactory.getLogger(PrepareRelatedDatasourcesJob.class); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + PrepareRelatedDatasourcesJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String graphPath = parser.get("graphPath"); + log.info("graphPath: {}", graphPath); + + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); + + final String relsPath = workingPath + "/datasources"; + log.info("relsPath: {}", relsPath); + + final SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, relsPath); + + final LongAccumulator total = spark.sparkContext().longAccumulator("total_datasources"); + + final Dataset dataset = ClusterUtils + .readPath(spark, graphPath + "/datasource", Datasource.class) + .map( + ds -> new SimpleDatasourceInfo(ds.getId(), ds.getDatasourcetype().getClassid()), + Encoders.bean(SimpleDatasourceInfo.class)); + + ClusterUtils.save(dataset, relsPath, SimpleDatasourceInfo.class, total); + + }); + + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java index 00957972a..e988366c8 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java @@ -9,13 +9,13 @@ import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; +import org.apache.spark.util.LongAccumulator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.fasterxml.jackson.databind.ObjectMapper; - +import eu.dnetlib.broker.objects.OaBrokerProject; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject; @@ -27,8 +27,6 @@ public class PrepareRelatedProjectsJob { private static final Logger log = LoggerFactory.getLogger(PrepareRelatedProjectsJob.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils @@ -58,23 +56,27 @@ public class PrepareRelatedProjectsJob { ClusterUtils.removeDir(spark, relsPath); - final Dataset projects = ClusterUtils.readPath(spark, graphPath + "/project", Project.class); + final LongAccumulator total = spark.sparkContext().longAccumulator("total_rels"); + + final Dataset projects = ClusterUtils + .readPath(spark, graphPath + "/project", Project.class) + .filter(p -> !ClusterUtils.isDedupRoot(p.getId())) + .map(ConversionUtils::oafProjectToBrokerProject, Encoders.bean(OaBrokerProject.class)); final Dataset rels = ClusterUtils .readPath(spark, graphPath + "/relation", Relation.class) - .filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT)); + .filter(r -> r.getDataInfo().getDeletedbyinference()) + .filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT)) + .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)) + .filter(r -> !ClusterUtils.isDedupRoot(r.getSource())) + .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget())); + + final Dataset dataset = rels + .joinWith(projects, projects.col("openaireId").equalTo(rels.col("target")), "inner") + .map(t -> new RelatedProject(t._1.getSource(), t._2), Encoders.bean(RelatedProject.class)); + + ClusterUtils.save(dataset, relsPath, RelatedProject.class, total); - rels - .joinWith(projects, projects.col("id").equalTo(rels.col("target")), "inner") - .map( - t -> new RelatedProject( - t._1.getSource(), - t._1.getRelType(), - ConversionUtils.oafProjectToBrokerProject(t._2)), - Encoders.bean(RelatedProject.class)) - .write() - .mode(SaveMode.Overwrite) - .json(relsPath); }); } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java index 945fd9ed7..724acc4dc 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java @@ -9,16 +9,16 @@ import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; +import org.apache.spark.util.LongAccumulator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.fasterxml.jackson.databind.ObjectMapper; - +import eu.dnetlib.broker.objects.OaBrokerRelatedPublication; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedPublication; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Relation; @@ -26,8 +26,6 @@ public class PrepareRelatedPublicationsJob { private static final Logger log = LoggerFactory.getLogger(PrepareRelatedPublicationsJob.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils @@ -57,22 +55,32 @@ public class PrepareRelatedPublicationsJob { ClusterUtils.removeDir(spark, relsPath); - final Dataset pubs = ClusterUtils - .readPath(spark, graphPath + "/publication", Publication.class); + final LongAccumulator total = spark.sparkContext().longAccumulator("total_rels"); - final Dataset rels = ClusterUtils.readPath(spark, graphPath + "/relation", Relation.class); - - rels - .joinWith(pubs, pubs.col("id").equalTo(rels.col("target")), "inner") + final Dataset pubs = ClusterUtils + .readPath(spark, graphPath + "/publication", Publication.class) + .filter(p -> !ClusterUtils.isDedupRoot(p.getId())) .map( - t -> new RelatedPublication( - t._1.getSource(), - t._1.getRelType(), - ConversionUtils.oafPublicationToBrokerPublication(t._2)), - Encoders.bean(RelatedPublication.class)) - .write() - .mode(SaveMode.Overwrite) - .json(relsPath); + ConversionUtils::oafPublicationToBrokerPublication, + Encoders.bean(OaBrokerRelatedPublication.class)); + + final Dataset rels = ClusterUtils + .readPath(spark, graphPath + "/relation", Relation.class) + .filter(r -> r.getDataInfo().getDeletedbyinference()) + .filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT)) + .filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass())) + .filter(r -> !ClusterUtils.isDedupRoot(r.getSource())) + .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget())); + + final Dataset dataset = rels + .joinWith(pubs, pubs.col("openaireId").equalTo(rels.col("target")), "inner") + .map(t -> { + final RelatedPublication rel = new RelatedPublication(t._1.getSource(), t._2); + rel.getRelPublication().setRelType(t._1.getRelClass()); + return rel; + }, Encoders.bean(RelatedPublication.class)); + + ClusterUtils.save(dataset, relsPath, RelatedPublication.class, total); }); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java index edb8dc1c3..d15565d0d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java @@ -9,16 +9,17 @@ import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; +import org.apache.spark.util.LongAccumulator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.fasterxml.jackson.databind.ObjectMapper; - +import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedSoftware; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Software; @@ -26,8 +27,6 @@ public class PrepareRelatedSoftwaresJob { private static final Logger log = LoggerFactory.getLogger(PrepareRelatedSoftwaresJob.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils @@ -57,21 +56,26 @@ public class PrepareRelatedSoftwaresJob { ClusterUtils.removeDir(spark, relsPath); - final Dataset softwares = ClusterUtils.readPath(spark, graphPath + "/software", Software.class); + final LongAccumulator total = spark.sparkContext().longAccumulator("total_rels"); - final Dataset rels = ClusterUtils.readPath(spark, graphPath + "/relation", Relation.class); + final Dataset softwares = ClusterUtils + .readPath(spark, graphPath + "/software", Software.class) + .filter(sw -> !ClusterUtils.isDedupRoot(sw.getId())) + .map(ConversionUtils::oafSoftwareToBrokerSoftware, Encoders.bean(OaBrokerRelatedSoftware.class)); - rels - .joinWith(softwares, softwares.col("id").equalTo(rels.col("target")), "inner") - .map( - t -> new RelatedSoftware( - t._1.getSource(), - t._1.getRelType(), - ConversionUtils.oafSoftwareToBrokerSoftware(t._2)), - Encoders.bean(RelatedSoftware.class)) - .write() - .mode(SaveMode.Overwrite) - .json(relsPath); + final Dataset rels = ClusterUtils + .readPath(spark, graphPath + "/relation", Relation.class) + .filter(r -> r.getDataInfo().getDeletedbyinference()) + .filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT)) + .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)) + .filter(r -> !ClusterUtils.isDedupRoot(r.getSource())) + .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget())); + + final Dataset dataset = rels + .joinWith(softwares, softwares.col("openaireId").equalTo(rels.col("target")), "inner") + .map(t -> new RelatedSoftware(t._1.getSource(), t._2), Encoders.bean(RelatedSoftware.class)); + + ClusterUtils.save(dataset, relsPath, RelatedSoftware.class, total); }); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareSimpleEntititiesJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareSimpleEntititiesJob.java index 213003db2..d3c7113ec 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareSimpleEntititiesJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareSimpleEntititiesJob.java @@ -9,8 +9,8 @@ import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; +import org.apache.spark.util.LongAccumulator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -56,13 +56,14 @@ public class PrepareSimpleEntititiesJob { ClusterUtils.removeDir(spark, simpleEntitiesPath); - prepareSimpleEntities(spark, graphPath, Publication.class) + final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities"); + + final Dataset dataset = prepareSimpleEntities(spark, graphPath, Publication.class) .union(prepareSimpleEntities(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class)) .union(prepareSimpleEntities(spark, graphPath, Software.class)) - .union(prepareSimpleEntities(spark, graphPath, OtherResearchProduct.class)) - .write() - .mode(SaveMode.Overwrite) - .json(simpleEntitiesPath); + .union(prepareSimpleEntities(spark, graphPath, OtherResearchProduct.class)); + + ClusterUtils.save(dataset, simpleEntitiesPath, OaBrokerMainEntity.class, total); }); } @@ -74,6 +75,7 @@ public class PrepareSimpleEntititiesJob { return ClusterUtils .readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass) + .filter(r -> !ClusterUtils.isDedupRoot(r.getId())) .filter(r -> r.getDataInfo().getDeletedbyinference()) .map(ConversionUtils::oafResultToBrokerResult, Encoders.bean(OaBrokerMainEntity.class)); } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java index 9aa6f5384..af6ab30a1 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java @@ -1,16 +1,18 @@ package eu.dnetlib.dhp.broker.oa.matchers; -import java.util.Arrays; +import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.function.BiConsumer; import java.util.function.Function; +import java.util.stream.Collectors; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; +import org.apache.spark.util.LongAccumulator; import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.model.Topic; @@ -19,15 +21,15 @@ import eu.dnetlib.pace.config.DedupConfig; public abstract class UpdateMatcher { - private final boolean multipleUpdate; + private final int maxNumber; private final Function topicFunction; private final BiConsumer compileHighlightFunction; private final Function highlightToStringFunction; - public UpdateMatcher(final boolean multipleUpdate, final Function topicFunction, + public UpdateMatcher(final int maxNumber, final Function topicFunction, final BiConsumer compileHighlightFunction, final Function highlightToStringFunction) { - this.multipleUpdate = multipleUpdate; + this.maxNumber = maxNumber; this.topicFunction = topicFunction; this.compileHighlightFunction = compileHighlightFunction; this.highlightToStringFunction = highlightToStringFunction; @@ -35,7 +37,8 @@ public abstract class UpdateMatcher { public Collection> searchUpdatesForRecord(final OaBrokerMainEntity res, final Collection others, - final DedupConfig dedupConfig) { + final DedupConfig dedupConfig, + final Map accumulators) { final Map> infoMap = new HashMap<>(); @@ -43,28 +46,34 @@ public abstract class UpdateMatcher { if (source != res) { for (final T hl : findDifferences(source, res)) { final Topic topic = getTopicFunction().apply(hl); - final UpdateInfo info = new UpdateInfo<>(topic, hl, source, res, getCompileHighlightFunction(), - getHighlightToStringFunction(), dedupConfig); - final String s = DigestUtils.md5Hex(info.getHighlightValueAsString()); - if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) { - } else { - infoMap.put(s, info); + if (topic != null) { + final UpdateInfo info = new UpdateInfo<>(topic, hl, source, res, + getCompileHighlightFunction(), + getHighlightToStringFunction(), dedupConfig); + + final String s = DigestUtils.md5Hex(info.getHighlightValueAsString()); + if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) { + infoMap.put(s, info); + } } } } } - final Collection> values = infoMap.values(); + final List> values = infoMap + .values() + .stream() + .sorted((o1, o2) -> Float.compare(o2.getTrust(), o1.getTrust())) // DESCENDING + .collect(Collectors.toList()); - if (values.isEmpty() || multipleUpdate) { - return values; + if (values.isEmpty()) { + return new ArrayList<>(); + } else if (values.size() > maxNumber) { + incrementAccumulator(accumulators, maxNumber); + return values.subList(0, maxNumber); } else { - final UpdateInfo v = values - .stream() - .sorted((o1, o2) -> Float.compare(o1.getTrust(), o2.getTrust())) - .findFirst() - .get(); - return Arrays.asList(v); + incrementAccumulator(accumulators, values.size()); + return values; } } @@ -74,12 +83,12 @@ public abstract class UpdateMatcher { return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0)); } - protected boolean isMissing(final String field) { - return StringUtils.isBlank(field); + protected boolean isMissing(final String s) { + return StringUtils.isBlank(s); } - public boolean isMultipleUpdate() { - return multipleUpdate; + public int getMaxNumber() { + return maxNumber; } public Function getTopicFunction() { @@ -94,4 +103,14 @@ public abstract class UpdateMatcher { return highlightToStringFunction; } + public String accumulatorName() { + return "event_matcher_" + getClass().getSimpleName().toLowerCase(); + } + + public void incrementAccumulator(final Map accumulators, final long n) { + if (accumulators != null && accumulators.containsKey(accumulatorName())) { + accumulators.get(accumulatorName()).add(n); + } + } + } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java index c197734a3..2f73a2448 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets; +import java.util.ArrayList; import java.util.List; import java.util.Set; import java.util.stream.Collectors; @@ -9,14 +10,15 @@ import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.broker.objects.OaBrokerRelatedDataset; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; public abstract class AbstractEnrichMissingDataset extends UpdateMatcher { public AbstractEnrichMissingDataset(final Topic topic) { - super(true, + super(10, rel -> topic, (p, rel) -> p.getDatasets().add(rel), - rel -> rel.getOriginalId()); + rel -> rel.getOpenaireId()); } protected abstract boolean filterByType(String relType); @@ -25,18 +27,22 @@ public abstract class AbstractEnrichMissingDataset extends UpdateMatcher findDifferences(final OaBrokerMainEntity source, final OaBrokerMainEntity target) { + if (target.getDatasets().size() >= BrokerConstants.MAX_LIST_SIZE) { + return new ArrayList<>(); + } + final Set existingDatasets = target .getDatasets() .stream() .filter(rel -> filterByType(rel.getRelType())) - .map(OaBrokerRelatedDataset::getOriginalId) + .map(OaBrokerRelatedDataset::getOpenaireId) .collect(Collectors.toSet()); return source .getDatasets() .stream() .filter(rel -> filterByType(rel.getRelType())) - .filter(d -> !existingDatasets.contains(d.getOriginalId())) + .filter(d -> !existingDatasets.contains(d.getOpenaireId())) .collect(Collectors.toList()); } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java index 49c546bba..4b563d381 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java @@ -12,7 +12,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; public class EnrichMissingProject extends UpdateMatcher { public EnrichMissingProject() { - super(true, + super(20, prj -> Topic.ENRICH_MISSING_PROJECT, (p, prj) -> p.getProjects().add(prj), prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode()); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java index 6954a3fb5..6a10f19be 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedProjects; +import java.util.ArrayList; import java.util.List; import java.util.Set; import java.util.stream.Collectors; @@ -9,11 +10,12 @@ import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.broker.objects.OaBrokerProject; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; public class EnrichMoreProject extends UpdateMatcher { public EnrichMoreProject() { - super(true, + super(20, prj -> Topic.ENRICH_MORE_PROJECT, (p, prj) -> p.getProjects().add(prj), prj -> projectAsString(prj)); @@ -27,6 +29,10 @@ public class EnrichMoreProject extends UpdateMatcher { protected List findDifferences(final OaBrokerMainEntity source, final OaBrokerMainEntity target) { + if (target.getProjects().size() >= BrokerConstants.MAX_LIST_SIZE) { + return new ArrayList<>(); + } + final Set existingProjects = target .getProjects() .stream() diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java index ad6d8263b..7ba3e5e02 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications; +import java.util.ArrayList; import java.util.List; import java.util.Set; import java.util.stream.Collectors; @@ -9,14 +10,15 @@ import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.broker.objects.OaBrokerRelatedPublication; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; public abstract class AbstractEnrichMissingPublication extends UpdateMatcher { public AbstractEnrichMissingPublication(final Topic topic) { - super(true, + super(10, rel -> topic, (p, rel) -> p.getPublications().add(rel), - rel -> rel.getOriginalId()); + rel -> rel.getOpenaireId()); } @@ -27,18 +29,22 @@ public abstract class AbstractEnrichMissingPublication extends UpdateMatcher= BrokerConstants.MAX_LIST_SIZE) { + return new ArrayList<>(); + } + final Set existingPublications = target .getPublications() .stream() .filter(rel -> filterByType(rel.getRelType())) - .map(OaBrokerRelatedPublication::getOriginalId) + .map(OaBrokerRelatedPublication::getOpenaireId) .collect(Collectors.toSet()); return source .getPublications() .stream() .filter(rel -> filterByType(rel.getRelType())) - .filter(p -> !existingPublications.contains(p.getOriginalId())) + .filter(p -> !existingPublications.contains(p.getOpenaireId())) .collect(Collectors.toList()); } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java index 452caa503..a638024bc 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java @@ -13,10 +13,10 @@ public class EnrichMissingSoftware extends UpdateMatcher { public EnrichMissingSoftware() { - super(true, + super(10, s -> Topic.ENRICH_MISSING_SOFTWARE, (p, s) -> p.getSoftwares().add(s), - s -> s.getName()); + s -> s.getOpenaireId()); } @Override diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java index aaffe1249..a6cd34359 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware; +import java.util.ArrayList; import java.util.List; import java.util.Set; import java.util.stream.Collectors; @@ -9,14 +10,15 @@ import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; public class EnrichMoreSoftware extends UpdateMatcher { public EnrichMoreSoftware() { - super(true, + super(10, s -> Topic.ENRICH_MORE_SOFTWARE, (p, s) -> p.getSoftwares().add(s), - s -> s.getName()); + s -> s.getOpenaireId()); } @Override @@ -24,6 +26,10 @@ public class EnrichMoreSoftware extends UpdateMatcher { final OaBrokerMainEntity source, final OaBrokerMainEntity target) { + if (target.getSoftwares().size() >= BrokerConstants.MAX_LIST_SIZE) { + return new ArrayList<>(); + } + final Set existingSoftwares = source .getSoftwares() .stream() diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java index 73462bae8..b61696e45 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java @@ -12,7 +12,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; public class EnrichMissingAbstract extends UpdateMatcher { public EnrichMissingAbstract() { - super(false, + super(1, s -> Topic.ENRICH_MISSING_ABSTRACT, (p, s) -> p.getAbstracts().add(s), s -> s); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java index 2a01188a9..e834d1dde 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.simple; +import java.util.ArrayList; import java.util.List; import java.util.Set; import java.util.stream.Collectors; @@ -11,11 +12,12 @@ import eu.dnetlib.broker.objects.OaBrokerAuthor; import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; public class EnrichMissingAuthorOrcid extends UpdateMatcher { public EnrichMissingAuthorOrcid() { - super(true, + super(40, aut -> Topic.ENRICH_MISSING_AUTHOR_ORCID, (p, aut) -> p.getCreators().add(aut), aut -> aut.getOrcid()); @@ -25,6 +27,10 @@ public class EnrichMissingAuthorOrcid extends UpdateMatcher { protected List findDifferences(final OaBrokerMainEntity source, final OaBrokerMainEntity target) { + if (target.getCreators().size() >= BrokerConstants.MAX_LIST_SIZE) { + return new ArrayList<>(); + } + final Set existingOrcids = target .getCreators() .stream() diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java index 487382957..8e4f2fcf4 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.simple; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; @@ -14,7 +15,7 @@ import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; public class EnrichMissingOpenAccess extends UpdateMatcher { public EnrichMissingOpenAccess() { - super(true, + super(20, i -> Topic.ENRICH_MISSING_OA_VERSION, (p, i) -> p.getInstances().add(i), OaBrokerInstance::getUrl); @@ -23,6 +24,11 @@ public class EnrichMissingOpenAccess extends UpdateMatcher { @Override protected List findDifferences(final OaBrokerMainEntity source, final OaBrokerMainEntity target) { + + if (target.getInstances().size() >= BrokerConstants.MAX_LIST_SIZE) { + return new ArrayList<>(); + } + final long count = target .getInstances() .stream() diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java index ee1617b1e..4e4003890 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java @@ -13,7 +13,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; public class EnrichMissingPid extends UpdateMatcher { public EnrichMissingPid() { - super(true, + super(10, pid -> Topic.ENRICH_MISSING_PID, (p, pid) -> p.getPids().add(pid), pid -> pid.getType() + "::" + pid.getValue()); @@ -22,9 +22,8 @@ public class EnrichMissingPid extends UpdateMatcher { @Override protected List findDifferences(final OaBrokerMainEntity source, final OaBrokerMainEntity target) { - final long count = target.getPids().size(); - if (count > 0) { + if (target.getPids().size() > 0) { return Arrays.asList(); } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java index 2c0533fa3..e7b65dad8 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java @@ -12,7 +12,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; public class EnrichMissingPublicationDate extends UpdateMatcher { public EnrichMissingPublicationDate() { - super(false, + super(1, date -> Topic.ENRICH_MISSING_PUBLICATION_DATE, (p, date) -> p.setPublicationdate(date), s -> s); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java index 9ab9fce48..26ebbb7c0 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.simple; +import java.util.ArrayList; import java.util.List; import java.util.Set; import java.util.stream.Collectors; @@ -9,11 +10,12 @@ import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.broker.objects.OaBrokerTypedValue; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; public class EnrichMissingSubject extends UpdateMatcher { public EnrichMissingSubject() { - super(true, + super(20, s -> Topic.fromPath("ENRICH/MISSING/SUBJECT/" + s.getType()), (p, s) -> p.getSubjects().add(s), s -> subjectAsString(s)); @@ -22,6 +24,11 @@ public class EnrichMissingSubject extends UpdateMatcher { @Override protected List findDifferences(final OaBrokerMainEntity source, final OaBrokerMainEntity target) { + + if (target.getSubjects().size() >= BrokerConstants.MAX_LIST_SIZE) { + return new ArrayList<>(); + } + final Set existingSubject = target .getSubjects() .stream() diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java index e90a8f201..46f6fa80c 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.simple; +import java.util.ArrayList; import java.util.List; import java.util.Set; import java.util.stream.Collectors; @@ -14,7 +15,7 @@ import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; public class EnrichMoreOpenAccess extends UpdateMatcher { public EnrichMoreOpenAccess() { - super(true, + super(20, i -> Topic.ENRICH_MORE_OA_VERSION, (p, i) -> p.getInstances().add(i), OaBrokerInstance::getUrl); @@ -23,6 +24,11 @@ public class EnrichMoreOpenAccess extends UpdateMatcher { @Override protected List findDifferences(final OaBrokerMainEntity source, final OaBrokerMainEntity target) { + + if (target.getInstances().size() >= BrokerConstants.MAX_LIST_SIZE) { + return new ArrayList<>(); + } + final Set urls = target .getInstances() .stream() diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java index 43b4f0628..609437b9d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.simple; +import java.util.ArrayList; import java.util.List; import java.util.Set; import java.util.stream.Collectors; @@ -9,11 +10,12 @@ import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.broker.objects.OaBrokerTypedValue; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; public class EnrichMorePid extends UpdateMatcher { public EnrichMorePid() { - super(true, + super(20, pid -> Topic.ENRICH_MORE_PID, (p, pid) -> p.getPids().add(pid), pid -> pidAsString(pid)); @@ -22,6 +24,11 @@ public class EnrichMorePid extends UpdateMatcher { @Override protected List findDifferences(final OaBrokerMainEntity source, final OaBrokerMainEntity target) { + + if (target.getPids().size() >= BrokerConstants.MAX_LIST_SIZE) { + return new ArrayList<>(); + } + final Set existingPids = target .getPids() .stream() diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java index 04fb494ef..bbe6609d7 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.simple; +import java.util.ArrayList; import java.util.List; import java.util.Set; import java.util.stream.Collectors; @@ -9,11 +10,12 @@ import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.broker.objects.OaBrokerTypedValue; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; public class EnrichMoreSubject extends UpdateMatcher { public EnrichMoreSubject() { - super(true, + super(20, s -> Topic.fromPath("ENRICH/MORE/SUBJECT/" + s.getType()), (p, s) -> p.getSubjects().add(s), s -> subjectAsString(s)); @@ -22,6 +24,11 @@ public class EnrichMoreSubject extends UpdateMatcher { @Override protected List findDifferences(final OaBrokerMainEntity source, final OaBrokerMainEntity target) { + + if (target.getSubjects().size() >= BrokerConstants.MAX_LIST_SIZE) { + return new ArrayList<>(); + } + final Set existingSubjects = target .getSubjects() .stream() diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java index 49c46c7f0..5308b9dff 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java @@ -17,6 +17,12 @@ public class BrokerConstants { public static final float MIN_TRUST = 0.25f; public static final float MAX_TRUST = 1.00f; + public static final int MAX_NUMBER_OF_RELS = 20; + + public static final int MAX_STRING_SIZE = 3000; + + public static final int MAX_LIST_SIZE = 50; + public static Class[] getModelClasses() { final Set> list = new HashSet<>(); list.addAll(Arrays.asList(ModelSupport.getOafModelClasses())); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java index 15a1ddd88..2d0106a7a 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java @@ -4,7 +4,9 @@ package eu.dnetlib.dhp.broker.oa.util; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; +import org.apache.spark.util.LongAccumulator; import com.fasterxml.jackson.databind.ObjectMapper; @@ -32,4 +34,32 @@ public class ClusterUtils { .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); } + public static boolean isDedupRoot(final String id) { + return id.contains("dedup_wf_"); + } + + public static final boolean isValidResultResultClass(final String s) { + return s.equals("isReferencedBy") + || s.equals("isRelatedTo") + || s.equals("references") + || s.equals("isSupplementedBy") + || s.equals("isSupplementedTo"); + } + + public static T incrementAccumulator(final T o, final LongAccumulator acc) { + if (acc != null) { + acc.add(1); + } + return o; + } + + public static void save(final Dataset dataset, final String path, final Class clazz, + final LongAccumulator acc) { + dataset + .map(o -> ClusterUtils.incrementAccumulator(o, acc), Encoders.bean(clazz)) + .write() + .mode(SaveMode.Overwrite) + .json(path); + } + } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java index b61d5e7cc..1ce84283a 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java @@ -123,7 +123,8 @@ public class ConversionUtils { res.setCollectedFromName(mappedFirst(result.getCollectedfrom(), KeyValue::getValue)); res.setPids(mappedList(result.getPid(), ConversionUtils::oafPidToBrokerPid)); res.setInstances(flatMappedList(result.getInstance(), ConversionUtils::oafInstanceToBrokerInstances)); - res.setExternalReferences(mappedList(result.getExternalReference(), ConversionUtils::oafExtRefToBrokerExtRef)); + res + .setExternalReferences(mappedList(result.getExternalReference(), ConversionUtils::oafExtRefToBrokerExtRef)); return res; } @@ -245,7 +246,13 @@ public class ConversionUtils { private static List fieldList(final List> fl) { return fl != null - ? fl.stream().map(Field::getValue).filter(StringUtils::isNotBlank).collect(Collectors.toList()) + ? fl + .stream() + .map(Field::getValue) + .map(s -> StringUtils.abbreviate(s, BrokerConstants.MAX_STRING_SIZE)) + .filter(StringUtils::isNotBlank) + .limit(BrokerConstants.MAX_LIST_SIZE) + .collect(Collectors.toList()) : new ArrayList<>(); } @@ -255,6 +262,7 @@ public class ConversionUtils { .stream() .map(StructuredProperty::getValue) .filter(StringUtils::isNotBlank) + .limit(BrokerConstants.MAX_LIST_SIZE) .collect(Collectors.toList()) : new ArrayList<>(); } @@ -280,6 +288,7 @@ public class ConversionUtils { .stream() .map(func::apply) .filter(Objects::nonNull) + .limit(BrokerConstants.MAX_LIST_SIZE) .collect(Collectors.toList()); } @@ -293,6 +302,7 @@ public class ConversionUtils { .map(func::apply) .flatMap(List::stream) .filter(Objects::nonNull) + .limit(BrokerConstants.MAX_LIST_SIZE) .collect(Collectors.toList()); } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java index 7451e5891..e7abae68b 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java @@ -3,6 +3,12 @@ package eu.dnetlib.dhp.broker.oa.util; import java.util.ArrayList; import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.spark.util.LongAccumulator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.model.EventFactory; @@ -35,7 +41,9 @@ import eu.dnetlib.pace.config.DedupConfig; public class EventFinder { - private static List> matchers = new ArrayList<>(); + private static final Logger log = LoggerFactory.getLogger(EventFinder.class); + + private static final List> matchers = new ArrayList<>(); static { matchers.add(new EnrichMissingAbstract()); matchers.add(new EnrichMissingAuthorOrcid()); @@ -65,22 +73,48 @@ public class EventFinder { matchers.add(new EnrichMissingAbstract()); } - public static EventGroup generateEvents(final ResultGroup results, final DedupConfig dedupConfig) { + public static EventGroup generateEvents(final ResultGroup results, + final Set dsIdWhitelist, + final Set dsIdBlacklist, + final Set dsTypeWhitelist, + final DedupConfig dedupConfig, + final Map accumulators) { + final List> list = new ArrayList<>(); for (final OaBrokerMainEntity target : results.getData()) { - for (final UpdateMatcher matcher : matchers) { - list.addAll(matcher.searchUpdatesForRecord(target, results.getData(), dedupConfig)); + if (verifyTarget(target, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist)) { + for (final UpdateMatcher matcher : matchers) { + list.addAll(matcher.searchUpdatesForRecord(target, results.getData(), dedupConfig, accumulators)); + } } } return asEventGroup(list); } + private static boolean verifyTarget(final OaBrokerMainEntity target, + final Set dsIdWhitelist, + final Set dsIdBlacklist, + final Set dsTypeWhitelist) { + + if (dsIdWhitelist.contains(target.getCollectedFromId())) { + return true; + } else if (dsIdBlacklist.contains(target.getCollectedFromId())) { + return false; + } else { + return dsTypeWhitelist.contains(target.getCollectedFromType()); + } + } + private static EventGroup asEventGroup(final List> list) { final EventGroup events = new EventGroup(); list.stream().map(EventFactory::newBrokerEvent).forEach(events::addElement); return events; } + public static List> getMatchers() { + return matchers; + } + } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventGroup.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventGroup.java index 25c7698a0..503e31ae1 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventGroup.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventGroup.java @@ -14,12 +14,16 @@ public class EventGroup implements Serializable { */ private static final long serialVersionUID = 765977943803533130L; - private final List data = new ArrayList<>(); + private List data = new ArrayList<>(); public List getData() { return data; } + public void setData(final List data) { + this.data = data; + } + public EventGroup addElement(final Event elem) { data.add(elem); return this; diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java index 25d0d2bca..0586b681e 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java @@ -111,8 +111,9 @@ public final class UpdateInfo { final OaBrokerMainEntity hl = new OaBrokerMainEntity(); compileHighlight.accept(hl, getHighlightValue()); - final String provId = getSource().getOriginalId(); + final String provId = getSource().getOpenaireId(); final String provRepo = getSource().getCollectedFromName(); + final String provType = getSource().getCollectedFromType(); final String provUrl = getSource() .getInstances() @@ -122,7 +123,7 @@ public final class UpdateInfo { .orElse(null); ; - final OaBrokerProvenance provenance = new OaBrokerProvenance(provId, provRepo, provUrl); + final OaBrokerProvenance provenance = new OaBrokerProvenance(provId, provRepo, provType, provUrl); final OaBrokerEventPayload res = new OaBrokerEventPayload(); res.setResult(target); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/AddDatasourceTypeAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/AddDatasourceTypeAggregator.java new file mode 100644 index 000000000..ccd15c8c6 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/AddDatasourceTypeAggregator.java @@ -0,0 +1,59 @@ + +package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; + +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.expressions.Aggregator; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import scala.Tuple2; + +public class AddDatasourceTypeAggregator + extends Aggregator, OaBrokerMainEntity, OaBrokerMainEntity> { + + /** + * + */ + private static final long serialVersionUID = 8788588975496014728L; + + @Override + public OaBrokerMainEntity zero() { + return new OaBrokerMainEntity(); + } + + @Override + public OaBrokerMainEntity finish(final OaBrokerMainEntity g) { + return g; + } + + @Override + public OaBrokerMainEntity reduce(final OaBrokerMainEntity g, + final Tuple2 t) { + final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOpenaireId()) ? g : t._1; + if (t._2 != null && StringUtils.isNotBlank(t._2.getType())) { + res.setCollectedFromType(t._2.getType()); + } + return res; + + } + + @Override + public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) { + if (StringUtils.isNotBlank(g1.getOpenaireId()) && StringUtils.isNotBlank(g1.getCollectedFromType())) { + return g1; + } else { + return g2; + } + } + + @Override + public Encoder bufferEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); + } + + @Override + public Encoder outputEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); + } +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/OaBrokerMainEntityAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/OaBrokerMainEntityAggregator.java deleted file mode 100644 index 6a2d9b06d..000000000 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/OaBrokerMainEntityAggregator.java +++ /dev/null @@ -1,71 +0,0 @@ - -package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; - -import org.apache.spark.sql.Encoder; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.expressions.Aggregator; - -import eu.dnetlib.broker.objects.OaBrokerMainEntity; -import scala.Tuple2; - -public class OaBrokerMainEntityAggregator - extends Aggregator, OaBrokerMainEntity, OaBrokerMainEntity> { - - /** - * - */ - private static final long serialVersionUID = -3687878788861013488L; - - @Override - public OaBrokerMainEntity zero() { - return new OaBrokerMainEntity(); - } - - @Override - public OaBrokerMainEntity finish(final OaBrokerMainEntity g) { - return g; - } - - @Override - public OaBrokerMainEntity reduce(final OaBrokerMainEntity g, final Tuple2 t) { - if (g.getOriginalId() == null) { - return t._1; - } else if (t._2 instanceof RelatedSoftware) { - g.getSoftwares().add(((RelatedSoftware) t._2).getRelSoftware()); - } else if (t._2 instanceof RelatedDataset) { - g.getDatasets().add(((RelatedDataset) t._2).getRelDataset()); - } else if (t._2 instanceof RelatedPublication) { - g.getPublications().add(((RelatedPublication) t._2).getRelPublication()); - } else if (t._2 instanceof RelatedProject) { - g.getProjects().add(((RelatedProject) t._2).getRelProject()); - } else { - throw new RuntimeException("Invalid Object: " + t._2.getClass()); - } - return g; - - } - - @Override - public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) { - if (g1.getOriginalId() != null) { - g1.getSoftwares().addAll(g2.getSoftwares()); - g1.getDatasets().addAll(g2.getDatasets()); - g1.getPublications().addAll(g2.getPublications()); - g1.getProjects().addAll(g2.getProjects()); - return g1; - } else { - return g2; - } - } - - @Override - public Encoder bufferEncoder() { - return Encoders.bean(OaBrokerMainEntity.class); - } - - @Override - public Encoder outputEncoder() { - return Encoders.bean(OaBrokerMainEntity.class); - } - -} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDataset.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDataset.java index daf75ea2e..0925e3291 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDataset.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDataset.java @@ -11,16 +11,15 @@ public class RelatedDataset implements Serializable { * */ private static final long serialVersionUID = 774487705184038324L; + private String source; - private String relType; private OaBrokerRelatedDataset relDataset; public RelatedDataset() { } - public RelatedDataset(final String source, final String relType, final OaBrokerRelatedDataset relDataset) { + public RelatedDataset(final String source, final OaBrokerRelatedDataset relDataset) { this.source = source; - this.relType = relType; this.relDataset = relDataset; } @@ -32,14 +31,6 @@ public class RelatedDataset implements Serializable { this.source = source; } - public String getRelType() { - return relType; - } - - public void setRelType(final String relType) { - this.relType = relType; - } - public OaBrokerRelatedDataset getRelDataset() { return relDataset; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDatasetAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDatasetAggregator.java new file mode 100644 index 000000000..45000f6f3 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDatasetAggregator.java @@ -0,0 +1,68 @@ + +package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; + +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.expressions.Aggregator; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; +import scala.Tuple2; + +public class RelatedDatasetAggregator + extends Aggregator, OaBrokerMainEntity, OaBrokerMainEntity> { + + /** + * + */ + private static final long serialVersionUID = 6969761680131482557L; + + @Override + public OaBrokerMainEntity zero() { + return new OaBrokerMainEntity(); + } + + @Override + public OaBrokerMainEntity finish(final OaBrokerMainEntity g) { + return g; + } + + @Override + public OaBrokerMainEntity reduce(final OaBrokerMainEntity g, final Tuple2 t) { + final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOpenaireId()) ? g : t._1; + if (t._2 != null && res.getDatasets().size() < BrokerConstants.MAX_NUMBER_OF_RELS) { + res.getDatasets().add(t._2.getRelDataset()); + } + return res; + + } + + @Override + public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) { + if (StringUtils.isNotBlank(g1.getOpenaireId())) { + final int availables = BrokerConstants.MAX_NUMBER_OF_RELS - g1.getDatasets().size(); + if (availables > 0) { + if (g2.getDatasets().size() <= availables) { + g1.getDatasets().addAll(g2.getDatasets()); + } else { + g1.getDatasets().addAll(g2.getDatasets().subList(0, availables)); + } + } + return g1; + } else { + return g2; + } + } + + @Override + public Encoder bufferEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); + } + + @Override + public Encoder outputEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProject.java index 4116c8c77..74d19fe9d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProject.java @@ -13,15 +13,13 @@ public class RelatedProject implements Serializable { private static final long serialVersionUID = 4941437626549329870L; private String source; - private String relType; private OaBrokerProject relProject; public RelatedProject() { } - public RelatedProject(final String source, final String relType, final OaBrokerProject relProject) { + public RelatedProject(final String source, final OaBrokerProject relProject) { this.source = source; - this.relType = relType; this.relProject = relProject; } @@ -33,14 +31,6 @@ public class RelatedProject implements Serializable { this.source = source; } - public String getRelType() { - return relType; - } - - public void setRelType(final String relType) { - this.relType = relType; - } - public OaBrokerProject getRelProject() { return relProject; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProjectAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProjectAggregator.java new file mode 100644 index 000000000..787217837 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProjectAggregator.java @@ -0,0 +1,68 @@ + +package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; + +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.expressions.Aggregator; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; +import scala.Tuple2; + +public class RelatedProjectAggregator + extends Aggregator, OaBrokerMainEntity, OaBrokerMainEntity> { + + /** + * + */ + private static final long serialVersionUID = 8559808519152275763L; + + @Override + public OaBrokerMainEntity zero() { + return new OaBrokerMainEntity(); + } + + @Override + public OaBrokerMainEntity finish(final OaBrokerMainEntity g) { + return g; + } + + @Override + public OaBrokerMainEntity reduce(final OaBrokerMainEntity g, final Tuple2 t) { + final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOpenaireId()) ? g : t._1; + if (t._2 != null && res.getProjects().size() < BrokerConstants.MAX_NUMBER_OF_RELS) { + res.getProjects().add(t._2.getRelProject()); + } + return res; + + } + + @Override + public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) { + if (StringUtils.isNotBlank(g1.getOpenaireId())) { + final int availables = BrokerConstants.MAX_NUMBER_OF_RELS - g1.getProjects().size(); + if (availables > 0) { + if (g2.getProjects().size() <= availables) { + g1.getProjects().addAll(g2.getProjects()); + } else { + g1.getProjects().addAll(g2.getProjects().subList(0, availables)); + } + } + return g1; + } else { + return g2; + } + } + + @Override + public Encoder bufferEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); + } + + @Override + public Encoder outputEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublication.java index 9e222a952..ed6aeeab1 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublication.java @@ -13,16 +13,13 @@ public class RelatedPublication implements Serializable { private static final long serialVersionUID = 9021609640411395128L; private String source; - private String relType; private OaBrokerRelatedPublication relPublication; public RelatedPublication() { } - public RelatedPublication(final String source, final String relType, - final OaBrokerRelatedPublication relPublication) { + public RelatedPublication(final String source, final OaBrokerRelatedPublication relPublication) { this.source = source; - this.relType = relType; this.relPublication = relPublication; } @@ -34,14 +31,6 @@ public class RelatedPublication implements Serializable { this.source = source; } - public String getRelType() { - return relType; - } - - public void setRelType(final String relType) { - this.relType = relType; - } - public OaBrokerRelatedPublication getRelPublication() { return relPublication; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublicationAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublicationAggregator.java new file mode 100644 index 000000000..2289ebe36 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublicationAggregator.java @@ -0,0 +1,70 @@ + +package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; + +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.expressions.Aggregator; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; +import scala.Tuple2; + +public class RelatedPublicationAggregator + extends Aggregator, OaBrokerMainEntity, OaBrokerMainEntity> { + + /** + * + */ + private static final long serialVersionUID = 4656934981558135919L; + + @Override + public OaBrokerMainEntity zero() { + return new OaBrokerMainEntity(); + } + + @Override + public OaBrokerMainEntity finish(final OaBrokerMainEntity g) { + return g; + } + + @Override + public OaBrokerMainEntity reduce(final OaBrokerMainEntity g, + final Tuple2 t) { + final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOpenaireId()) ? g : t._1; + if (t._2 != null && res.getPublications().size() < BrokerConstants.MAX_NUMBER_OF_RELS) { + res.getPublications().add(t._2.getRelPublication()); + } + return res; + + } + + @Override + public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) { + if (StringUtils.isNotBlank(g1.getOpenaireId())) { + final int availables = BrokerConstants.MAX_NUMBER_OF_RELS - g1.getPublications().size(); + if (availables > 0) { + if (g2.getPublications().size() <= availables) { + g1.getPublications().addAll(g2.getPublications()); + } else { + g1.getPublications().addAll(g2.getPublications().subList(0, availables)); + } + } + return g1; + + } else { + return g2; + } + } + + @Override + public Encoder bufferEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); + } + + @Override + public Encoder outputEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftware.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftware.java index 2f3b8668c..0aa3a4045 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftware.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftware.java @@ -11,16 +11,15 @@ public class RelatedSoftware implements Serializable { * */ private static final long serialVersionUID = 7573383356943300157L; + private String source; - private String relType; private OaBrokerRelatedSoftware relSoftware; public RelatedSoftware() { } - public RelatedSoftware(final String source, final String relType, final OaBrokerRelatedSoftware relSoftware) { + public RelatedSoftware(final String source, final OaBrokerRelatedSoftware relSoftware) { this.source = source; - this.relType = relType; this.relSoftware = relSoftware; } @@ -32,14 +31,6 @@ public class RelatedSoftware implements Serializable { this.source = source; } - public String getRelType() { - return relType; - } - - public void setRelType(final String relType) { - this.relType = relType; - } - public OaBrokerRelatedSoftware getRelSoftware() { return relSoftware; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftwareAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftwareAggregator.java new file mode 100644 index 000000000..fedb3c9e9 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftwareAggregator.java @@ -0,0 +1,68 @@ + +package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; + +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.expressions.Aggregator; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; +import scala.Tuple2; + +public class RelatedSoftwareAggregator + extends Aggregator, OaBrokerMainEntity, OaBrokerMainEntity> { + + /** + * + */ + private static final long serialVersionUID = -8987959389106443702L; + + @Override + public OaBrokerMainEntity zero() { + return new OaBrokerMainEntity(); + } + + @Override + public OaBrokerMainEntity finish(final OaBrokerMainEntity g) { + return g; + } + + @Override + public OaBrokerMainEntity reduce(final OaBrokerMainEntity g, final Tuple2 t) { + final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOpenaireId()) ? g : t._1; + if (t._2 != null && res.getSoftwares().size() < BrokerConstants.MAX_NUMBER_OF_RELS) { + res.getSoftwares().add(t._2.getRelSoftware()); + } + return res; + + } + + @Override + public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) { + if (StringUtils.isNotBlank(g1.getOpenaireId())) { + final int availables = BrokerConstants.MAX_NUMBER_OF_RELS - g1.getSoftwares().size(); + if (availables > 0) { + if (g2.getSoftwares().size() <= availables) { + g1.getSoftwares().addAll(g2.getSoftwares()); + } else { + g1.getSoftwares().addAll(g2.getSoftwares().subList(0, availables)); + } + } + return g1; + } else { + return g2; + } + } + + @Override + public Encoder bufferEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); + } + + @Override + public Encoder outputEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/SimpleDatasourceInfo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/SimpleDatasourceInfo.java new file mode 100644 index 000000000..966f63fa0 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/SimpleDatasourceInfo.java @@ -0,0 +1,40 @@ + +package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; + +import java.io.Serializable; + +public class SimpleDatasourceInfo implements Serializable { + + /** + * + */ + private static final long serialVersionUID = 2996609859416024734L; + + private String id; + private String type; + + public SimpleDatasourceInfo() { + } + + public SimpleDatasourceInfo(final String id, final String type) { + this.id = id; + this.type = type; + } + + public String getId() { + return id; + } + + public void setId(final String id) { + this.id = id; + } + + public String getType() { + return type; + } + + public void setType(final String type) { + this.type = type; + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml index 9783fcab6..a0c7b00db 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml @@ -17,7 +17,29 @@ dedupConfProfId the id of a valid Dedup Configuration Profile - + + datasourceIdWhitelist + - + a white list (comma separeted, - for empty list) of datasource ids + + + datasourceTypeWhitelist + - + a white list (comma separeted, - for empty list) of datasource types + + + datasourceIdBlacklist + - + a black list (comma separeted, - for empty list) of datasource ids + + + esIndexName + the elasticsearch index name + + + esIndexHost + the elasticsearch host + sparkDriverMemory memory for driver process @@ -88,11 +110,12 @@ - + + + - @@ -118,7 +141,32 @@ + + + + yarn + cluster + PrepareRelatedDatasourcesJob + eu.dnetlib.dhp.broker.oa.PrepareRelatedDatasourcesJob + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + yarn @@ -190,7 +238,7 @@ - + yarn @@ -214,15 +262,111 @@ - - + + - + yarn cluster - JoinEntitiesJob - eu.dnetlib.dhp.broker.oa.JoinEntitiesJob + JoinStep0 + eu.dnetlib.dhp.broker.oa.JoinStep0Job + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + yarn + cluster + JoinStep1 + eu.dnetlib.dhp.broker.oa.JoinStep1Job + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + yarn + cluster + JoinStep2 + eu.dnetlib.dhp.broker.oa.JoinStep2Job + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + yarn + cluster + JoinStep3 + eu.dnetlib.dhp.broker.oa.JoinStep3Job + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + yarn + cluster + JoinStep4 + eu.dnetlib.dhp.broker.oa.JoinStep4Job dhp-broker-events-${projectVersion}.jar --executor-cores=${sparkExecutorCores} @@ -282,10 +426,37 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 - --graphPath${graphInputPath} --workingPath${workingPath} --isLookupUrl${isLookupUrl} --dedupConfProfile${dedupConfProfId} + --datasourceIdWhitelist${datasourceIdWhitelist} + --datasourceTypeWhitelist${datasourceTypeWhitelist} + --datasourceIdBlacklist${datasourceIdBlacklist} + + + + + + + + yarn + cluster + IndexOnESJob + eu.dnetlib.dhp.broker.oa.IndexOnESJob + dhp-broker-events-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.dynamicAllocation.maxExecutors="2" + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --workingPath${workingPath} + --index${esIndexName} + --esHost${esIndexHost} diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_events.json b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_events.json index d185bc73d..c545884f9 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_events.json +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_events.json @@ -1,10 +1,4 @@ [ - { - "paramName": "g", - "paramLongName": "graphPath", - "paramDescription": "the path where there the graph is stored", - "paramRequired": true - }, { "paramName": "o", "paramLongName": "workingPath", @@ -22,5 +16,23 @@ "paramLongName": "dedupConfProfile", "paramDescription": "the id of a valid Dedup Configuration Profile", "paramRequired": true + }, + { + "paramName": "datasourceIdWhitelist", + "paramLongName": "datasourceIdWhitelist", + "paramDescription": "a white list (comma separeted, - for empty list) of datasource ids", + "paramRequired": true + }, + { + "paramName": "datasourceTypeWhitelist", + "paramLongName": "datasourceTypeWhitelist", + "paramDescription": "a white list (comma separeted, - for empty list) of datasource types", + "paramRequired": true + }, + { + "paramName": "datasourceIdBlacklist", + "paramLongName": "datasourceIdBlacklist", + "paramDescription": "a black list (comma separeted, - for empty list) of datasource ids", + "paramRequired": true } ] diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/index_es.json b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/index_es.json new file mode 100644 index 000000000..ac1dbf786 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/index_es.json @@ -0,0 +1,20 @@ +[ + { + "paramName": "o", + "paramLongName": "workingPath", + "paramDescription": "the workinh path", + "paramRequired": true + }, + { + "paramName": "idx", + "paramLongName": "index", + "paramDescription": "the ES index", + "paramRequired": true + }, + { + "paramName": "es", + "paramLongName": "esHost", + "paramDescription": "the ES host", + "paramRequired": true + } +] diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/config-default.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/config-default.xml new file mode 100644 index 000000000..2e0ed9aee --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml new file mode 100644 index 000000000..b38290448 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml @@ -0,0 +1,110 @@ + + + + + graphInputPath + the path where the graph is stored + + + workingPath + the path where the the generated data will be stored + + + isLookupUrl + the address of the lookUp service + + + dedupConfProfId + the id of a valid Dedup Configuration Profile + + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + oozieActionShareLibForSpark2 + oozie action sharelib for spark 2.* + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + spark 2.* sql query execution listeners classname + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + + + + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + yarn + cluster + IndexOnESJob + eu.dnetlib.dhp.broker.oa.IndexOnESJob + dhp-broker-events-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.dynamicAllocation.maxExecutors="2" + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --workingPath${workingPath} + --index${esIndexName} + --esHost${esIndexHost} + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcherTest.java b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcherTest.java new file mode 100644 index 000000000..93bc5617f --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcherTest.java @@ -0,0 +1,125 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Arrays; +import java.util.Collection; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPublicationDate; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; + +class UpdateMatcherTest { + + UpdateMatcher matcher = new EnrichMissingPublicationDate(); + + @BeforeEach + void setUp() throws Exception { + } + + @Test + void testSearchUpdatesForRecord_1() { + final OaBrokerMainEntity res = new OaBrokerMainEntity(); + final OaBrokerMainEntity p1 = new OaBrokerMainEntity(); + final OaBrokerMainEntity p2 = new OaBrokerMainEntity(); + final OaBrokerMainEntity p3 = new OaBrokerMainEntity(); + final OaBrokerMainEntity p4 = new OaBrokerMainEntity(); + + final Collection> list = matcher + .searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null, null); + + assertTrue(list.isEmpty()); + } + + @Test + void testSearchUpdatesForRecord_2() { + final OaBrokerMainEntity res = new OaBrokerMainEntity(); + final OaBrokerMainEntity p1 = new OaBrokerMainEntity(); + final OaBrokerMainEntity p2 = new OaBrokerMainEntity(); + final OaBrokerMainEntity p3 = new OaBrokerMainEntity(); + final OaBrokerMainEntity p4 = new OaBrokerMainEntity(); + + res.setPublicationdate("2018"); + + final Collection> list = matcher + .searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null, null); + + assertTrue(list.isEmpty()); + } + + @Test + void testSearchUpdatesForRecord_3() { + final OaBrokerMainEntity res = new OaBrokerMainEntity(); + final OaBrokerMainEntity p1 = new OaBrokerMainEntity(); + final OaBrokerMainEntity p2 = new OaBrokerMainEntity(); + final OaBrokerMainEntity p3 = new OaBrokerMainEntity(); + final OaBrokerMainEntity p4 = new OaBrokerMainEntity(); + + p2.setPublicationdate("2018"); + + final Collection> list = matcher + .searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null, null); + + assertTrue(list.size() == 1); + } + + @Test + void testSearchUpdatesForRecord_4() { + final OaBrokerMainEntity res = new OaBrokerMainEntity(); + final OaBrokerMainEntity p1 = new OaBrokerMainEntity(); + final OaBrokerMainEntity p2 = new OaBrokerMainEntity(); + final OaBrokerMainEntity p3 = new OaBrokerMainEntity(); + final OaBrokerMainEntity p4 = new OaBrokerMainEntity(); + + res.setPublicationdate("2018"); + p2.setPublicationdate("2018"); + + final Collection> list = matcher + .searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null, null); + + assertTrue(list.isEmpty()); + } + + @Test + void testSearchUpdatesForRecord_5() { + final OaBrokerMainEntity res = new OaBrokerMainEntity(); + final OaBrokerMainEntity p1 = new OaBrokerMainEntity(); + final OaBrokerMainEntity p2 = new OaBrokerMainEntity(); + final OaBrokerMainEntity p3 = new OaBrokerMainEntity(); + final OaBrokerMainEntity p4 = new OaBrokerMainEntity(); + res.setPublicationdate("2018"); + p1.setPublicationdate("2018"); + p2.setPublicationdate("2018"); + p3.setPublicationdate("2018"); + p4.setPublicationdate("2018"); + + final Collection> list = matcher + .searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null, null); + + assertTrue(list.isEmpty()); + } + + @Test + void testSearchUpdatesForRecord_6() { + final OaBrokerMainEntity res = new OaBrokerMainEntity(); + final OaBrokerMainEntity p1 = new OaBrokerMainEntity(); + final OaBrokerMainEntity p2 = new OaBrokerMainEntity(); + final OaBrokerMainEntity p3 = new OaBrokerMainEntity(); + final OaBrokerMainEntity p4 = new OaBrokerMainEntity(); + + p1.setPublicationdate("2018"); + p2.setPublicationdate("2018"); + p3.setPublicationdate("2018"); + p4.setPublicationdate("2018"); + + final Collection> list = matcher + .searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null, null); + + assertTrue(list.size() == 1); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDateTest.java b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDateTest.java new file mode 100644 index 000000000..77a19af4c --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDateTest.java @@ -0,0 +1,57 @@ + +package eu.dnetlib.dhp.broker.oa.matchers.simple; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.List; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; + +class EnrichMissingPublicationDateTest { + + final EnrichMissingPublicationDate matcher = new EnrichMissingPublicationDate(); + + @BeforeEach + void setUp() throws Exception { + } + + @Test + void testFindDifferences_1() { + final OaBrokerMainEntity source = new OaBrokerMainEntity(); + final OaBrokerMainEntity target = new OaBrokerMainEntity(); + final List list = matcher.findDifferences(source, target); + assertTrue(list.isEmpty()); + } + + @Test + void testFindDifferences_2() { + final OaBrokerMainEntity source = new OaBrokerMainEntity(); + final OaBrokerMainEntity target = new OaBrokerMainEntity(); + source.setPublicationdate("2018"); + final List list = matcher.findDifferences(source, target); + assertTrue(list.size() == 1); + } + + @Test + void testFindDifferences_3() { + final OaBrokerMainEntity source = new OaBrokerMainEntity(); + final OaBrokerMainEntity target = new OaBrokerMainEntity(); + target.setPublicationdate("2018"); + final List list = matcher.findDifferences(source, target); + assertTrue(list.isEmpty()); + } + + @Test + void testFindDifferences_4() { + final OaBrokerMainEntity source = new OaBrokerMainEntity(); + final OaBrokerMainEntity target = new OaBrokerMainEntity(); + source.setPublicationdate("2018"); + target.setPublicationdate("2018"); + final List list = matcher.findDifferences(source, target); + assertTrue(list.isEmpty()); + } + +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala index 7b21ecda2..1a45defb0 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala @@ -271,6 +271,26 @@ object DoiBoostMappingUtil { } + + def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String): StructuredProperty = { + val sp = new StructuredProperty + sp.setQualifier(createQualifier(classId,className, schemeId, schemeName)) + sp.setValue(value) + sp + + } + + + + def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String, dataInfo: DataInfo): StructuredProperty = { + val sp = new StructuredProperty + sp.setQualifier(createQualifier(classId,className, schemeId, schemeName)) + sp.setValue(value) + sp.setDataInfo(dataInfo) + sp + + } + def createSP(value: String, classId: String, schemeId: String): StructuredProperty = { val sp = new StructuredProperty sp.setQualifier(createQualifier(classId, schemeId)) @@ -279,6 +299,8 @@ object DoiBoostMappingUtil { } + + def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = { val sp = new StructuredProperty sp.setQualifier(createQualifier(classId, schemeId)) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala index 2419f86a3..7bb4686cf 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala @@ -129,16 +129,16 @@ case object ConversionUtil { val fieldOfStudy = item._2 if (fieldOfStudy != null && fieldOfStudy.subjects != null && fieldOfStudy.subjects.nonEmpty) { val p: List[StructuredProperty] = fieldOfStudy.subjects.flatMap(s => { - val s1 = createSP(s.DisplayName, "keyword", "dnet:subject_classification_typologies") + val s1 = createSP(s.DisplayName, "MAG","Microsoft Academic Graph classification", "dnet:subject_classification_typologies", "dnet:subject_classification_typologies") val di = DoiBoostMappingUtil.generateDataInfo(s.Score.toString) var resList: List[StructuredProperty] = List(s1) if (s.MainType.isDefined) { val maintp = s.MainType.get - val s2 = createSP(s.MainType.get, "keyword", "dnet:subject_classification_typologies") + val s2 = createSP(s.MainType.get, "MAG","Microsoft Academic Graph classification", "dnet:subject_classification_typologies", "dnet:subject_classification_typologies") s2.setDataInfo(di) resList = resList ::: List(s2) if (maintp.contains(".")) { - val s3 = createSP(maintp.split("\\.").head, "keyword", "dnet:subject_classification_typologies") + val s3 = createSP(maintp.split("\\.").head, "MAG","Microsoft Academic Graph classification", "dnet:subject_classification_typologies", "dnet:subject_classification_typologies") s3.setDataInfo(di) resList = resList ::: List(s3) } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkPreProcessMAG.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkPreProcessMAG.scala index f3d051bd6..a24f0e6bb 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkPreProcessMAG.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkPreProcessMAG.scala @@ -43,7 +43,7 @@ object SparkPreProcessMAG { val distinctPaper: Dataset[MagPapers] = spark.createDataset(result) distinctPaper.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/Papers_distinct") - logger.info("Phase 6) Enrich Publication with description") + logger.info("Phase 0) Enrich Publication with description") val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract] pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract") diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/intersection/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/intersection/oozie_app/workflow.xml index 34ba5d89d..bf91958cf 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/intersection/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/intersection/oozie_app/workflow.xml @@ -32,7 +32,7 @@ - + diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala index d31f80248..f62ac2b67 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala @@ -18,6 +18,9 @@ class CrossrefMappingTest { val mapper = new ObjectMapper() + + + @Test def testFunderRelationshipsMapping(): Unit = { val template = Source.fromInputStream(getClass.getResourceAsStream("article_funder_template.json")).mkString @@ -58,6 +61,27 @@ class CrossrefMappingTest { } + @Test + def testOrcidID() :Unit = { + val json = Source.fromInputStream(getClass.getResourceAsStream("orcid_data.json")).mkString + + + assertNotNull(json) + assertFalse(json.isEmpty); + + val resultList: List[Oaf] = Crossref2Oaf.convert(json) + + assertTrue(resultList.nonEmpty) + + val items = resultList.filter(p => p.isInstanceOf[Result]) + + + mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) + items.foreach(p => println(mapper.writeValueAsString(p))) + + + } + @Test def testEmptyTitle() :Unit = { val json = Source.fromInputStream(getClass.getResourceAsStream("empty_title.json")).mkString diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/crossref/orcid_data.json b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/crossref/orcid_data.json new file mode 100644 index 000000000..def546ddb --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/crossref/orcid_data.json @@ -0,0 +1,271 @@ +{ + "DOI":"10.1016/j.carbpol.2020.115930", + "issued":{ + "date-parts":[ + [ + 2020, + 4 + ] + ] + }, + "published-print":{ + "date-parts":[ + [ + 2020, + 4 + ] + ] + }, + "prefix":"10.1016", + "subject":[ + "Organic Chemistry", + "Materials Chemistry", + "Polymers and Plastics" + ], + "author":[ + { + "affiliation":[ + + ], + "given":"Lei", + "family":"Fang", + "sequence":"first" + }, + { + "affiliation":[ + + ], + "given":"Hua", + "family":"Lin", + "sequence":"additional" + }, + { + "affiliation":[ + + ], + "given":"Zhenfeng", + "family":"Wu", + "sequence":"additional" + }, + { + "affiliation":[ + + ], + "given":"Zhen", + "family":"Wang", + "sequence":"additional" + }, + { + "affiliation":[ + + ], + "given":"Xinxin", + "family":"Fan", + "sequence":"additional" + }, + { + "affiliation":[ + + ], + "given":"Ziting", + "family":"Cheng", + "sequence":"additional" + }, + { + "affiliation":[ + + ], + "given":"Xiaoya", + "family":"Hou", + "sequence":"additional" + }, + { + "authenticated-orcid":false, + "given":"Daquan", + "family":"Chen", + "sequence":"additional", + "affiliation":[ + + ], + "ORCID":"http://orcid.org/0000-0002-6796-0204" + } + ], + "reference-count":41, + "ISSN":[ + "0144-8617" + ], + "assertion":[ + { + "name":"publisher", + "value":"Elsevier", + "label":"This article is maintained by" + }, + { + "name":"articletitle", + "value":"In vitro/vivo evaluation of novel mitochondrial targeting charge-reversal polysaccharide-based antitumor nanoparticle", + "label":"Article Title" + }, + { + "name":"journaltitle", + "value":"Carbohydrate Polymers", + "label":"Journal Title" + }, + { + "name":"articlelink", + "value":"https://doi.org/10.1016/j.carbpol.2020.115930", + "label":"CrossRef DOI link to publisher maintained version" + }, + { + "name":"content_type", + "value":"article", + "label":"Content Type" + }, + { + "name":"copyright", + "value":"\\u00a9 2020 Elsevier Ltd. All rights reserved.", + "label":"Copyright" + } + ], + "member":"78", + "source":"Crossref", + "score":1.0, + "deposited":{ + "timestamp":1584590965000, + "date-time":"2020-03-19T04:09:25Z", + "date-parts":[ + [ + 2020, + 3, + 19 + ] + ] + }, + "indexed":{ + "timestamp":1584592912467, + "date-time":"2020-03-19T04:41:52Z", + "date-parts":[ + [ + 2020, + 3, + 19 + ] + ] + }, + "type":"journal-article", + "URL":"http://dx.doi.org/10.1016/j.carbpol.2020.115930", + "is-referenced-by-count":0, + "volume":"234", + "issn-type":[ + { + "type":"print", + "value":"0144-8617" + } + ], + "link":[ + { + "URL":"https://api.elsevier.com/content/article/PII:S0144861720301041?httpAccept=text/xml", + "intended-application":"text-mining", + "content-version":"vor", + "content-type":"text/xml" + }, + { + "URL":"https://api.elsevier.com/content/article/PII:S0144861720301041?httpAccept=text/plain", + "intended-application":"text-mining", + "content-version":"vor", + "content-type":"text/plain" + } + ], + "update-policy":"http://dx.doi.org/10.1016/elsevier_cm_policy", + "references-count":41, + "short-container-title":[ + "Carbohydrate Polymers" + ], + "publisher":"Elsevier BV", + "content-domain":{ + "domain":[ + "elsevier.com", + "sciencedirect.com" + ], + "crossmark-restriction":true + }, + "language":"en", + "license":[ + { + "URL":"https://www.elsevier.com/tdm/userlicense/1.0/", + "start":{ + "timestamp":1585699200000, + "date-time":"2020-04-01T00:00:00Z", + "date-parts":[ + [ + 2020, + 4, + 1 + ] + ] + }, + "content-version":"tdm", + "delay-in-days":0 + } + ], + "created":{ + "timestamp":1581759678000, + "date-time":"2020-02-15T09:41:18Z", + "date-parts":[ + [ + 2020, + 2, + 15 + ] + ] + }, + "title":[ + "In vitro/vivo evaluation of novel mitochondrial targeting charge-reversal polysaccharide-based antitumor nanoparticle" + ], + "alternative-id":[ + "S0144861720301041" + ], + "container-title":[ + "Carbohydrate Polymers" + ], + "funder":[ + { + "doi-asserted-by":"publisher", + "DOI":"10.13039/501100007129", + "name":"Natural Science Foundation of Shandong Province", + "award":[ + "ZR2019ZD24", + "ZR2019YQ30" + ] + }, + { + "doi-asserted-by":"publisher", + "DOI":"10.13039/100010449", + "name":"Ministry of Education, Libya", + "award":[ + + ] + }, + { + "doi-asserted-by":"publisher", + "DOI":"10.13039/501100012249", + "name":"Jiangxi University of Traditional Chinese Medicine", + "award":[ + "TCM-0906" + ] + }, + { + "name":"Taishan Scholar Program", + "award":[ + "qnts20161035" + ] + }, + { + "name":"Open fund project of Key Laboratory of Modern Preparation of TCM", + "award":[ + + ] + } + ], + "page":"115930", + "article-number":"115930" +} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java index 8f43ab1cf..e1c4b53b5 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java @@ -147,9 +147,23 @@ public class CleanGraphSparkJob { if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) { i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY); } + if (Objects.isNull(i.getRefereed())) { + i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS)); + } + } + } + if (Objects.nonNull(r.getAuthor())) { + boolean nullRank = r + .getAuthor() + .stream() + .anyMatch(a -> Objects.isNull(a.getRank())); + if (nullRank) { + int i = 1; + for (Author author : r.getAuthor()) { + author.setRank(i++); + } } } - if (value instanceof Publication) { } else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index 0bad89e9e..da2ba4723 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -50,8 +50,6 @@ import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.google.common.collect.Lists; - import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.DbClient; import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication; @@ -106,6 +104,9 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i final String dbPassword = parser.get("postgresPassword"); log.info("postgresPassword: xxx"); + final String dbSchema = parser.get("dbschema"); + log.info("dbSchema {}: " + dbSchema); + final String isLookupUrl = parser.get("isLookupUrl"); log.info("isLookupUrl: {}", isLookupUrl); @@ -125,7 +126,11 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i smdbe.execute("queryDatasources.sql", smdbe::processDatasource); log.info("Processing projects..."); - smdbe.execute("queryProjects.sql", smdbe::processProject); + if (dbSchema.equalsIgnoreCase("beta")) { + smdbe.execute("queryProjects.sql", smdbe::processProject); + } else { + smdbe.execute("queryProjects_production.sql", smdbe::processProject); + } log.info("Processing orgs..."); smdbe.execute("queryOrganizations.sql", smdbe::processOrganization); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java index 58f068943..8ede40773 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java @@ -9,7 +9,15 @@ import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; -import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.ExtraInfo; +import eu.dnetlib.dhp.schema.oaf.Field; +import eu.dnetlib.dhp.schema.oaf.Journal; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.OAIProvenance; +import eu.dnetlib.dhp.schema.oaf.OriginDescription; +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.utils.DHPUtils; public class OafMapperUtils { @@ -89,7 +97,9 @@ public class OafMapperUtils { } public static StructuredProperty structuredProperty( - final String value, final Qualifier qualifier, final DataInfo dataInfo) { + final String value, + final Qualifier qualifier, + final DataInfo dataInfo) { if (value == null) { return null; } @@ -192,8 +202,12 @@ public class OafMapperUtils { } public static String createOpenaireId( - final int prefix, final String originalId, final boolean to_md5) { - if (to_md5) { + final int prefix, + final String originalId, + final boolean to_md5) { + if (StringUtils.isBlank(originalId)) { + return null; + } else if (to_md5) { final String nsPrefix = StringUtils.substringBefore(originalId, "::"); final String rest = StringUtils.substringAfter(originalId, "::"); return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest)); @@ -203,7 +217,9 @@ public class OafMapperUtils { } public static String createOpenaireId( - final String type, final String originalId, final boolean to_md5) { + final String type, + final String originalId, + final boolean to_md5) { switch (type) { case "datasource": return createOpenaireId(10, originalId, to_md5); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/migrate_db_entities_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/migrate_db_entities_parameters.json index 4e838561d..6dfef32db 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/migrate_db_entities_parameters.json +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/migrate_db_entities_parameters.json @@ -34,5 +34,11 @@ "paramLongName": "isLookupUrl", "paramDescription": "the url of the ISLookupService", "paramRequired": true + }, + { + "paramName": "dbschema", + "paramLongName": "dbschema", + "paramDescription": "the database schema according to the D-Net infrastructure (beta or production)", + "paramRequired": true } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml index 9a7e36570..d8b61b5ea 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml @@ -25,6 +25,11 @@ postgresPassword the password postgres + + + dbSchema + beta + the database schema according to the D-Net infrastructure (beta or production) mongoURL @@ -125,6 +130,7 @@ --postgresPassword${postgresPassword} --isLookupUrl${isLookupUrl} --actionclaims + --dbschema${dbSchema} @@ -175,6 +181,7 @@ --postgresUser${postgresUser} --postgresPassword${postgresPassword} --isLookupUrl${isLookupUrl} + --dbschema${dbSchema} diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 9bd20303f..b1f0ecf0d 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -111,6 +111,7 @@ public class MappersTest { assertNotNull(i.getAccessright()); assertEquals("OPEN", i.getAccessright().getClassid()); }); + assertEquals("0001", p.getInstance().get(0).getRefereed().getClassid()); assertNotNull(p.getBestaccessright()); assertEquals("OPEN", p.getBestaccessright().getClassid()); @@ -217,6 +218,7 @@ public class MappersTest { assertNotNull(i.getAccessright()); assertEquals("OPEN", i.getAccessright().getClassid()); }); + assertEquals("0001", d.getInstance().get(0).getRefereed().getClassid()); assertValidId(r1.getSource()); assertValidId(r1.getTarget()); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml index 2cb0ba1c7..ead22aa96 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml @@ -57,6 +57,7 @@ 10.3897/oneeco.2.e13718 https://oneecosystem.pensoft.net/article/13718/ One Ecosystem + 0001 diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml index 88ae9d106..5525a2753 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml @@ -90,6 +90,7 @@ corda_______::226852 + 0001s clazz, String outputPath) { - Dataset> relsByTarget = readPathRelation(spark, inputRelationsPath) + Dataset> relsByTarget = readPathRelation(spark, inputRelationsPath) .filter("dataInfo.deletedbyinference == false") .map( - (MapFunction>) r -> new Tuple2<>(r.getTarget(), r), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(SortableRelation.class))) + (MapFunction>) r -> new Tuple2<>(r.getTarget(), + r), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(Relation.class))) .cache(); Dataset> entities = readPathEntity(spark, inputEntityPath, clazz) @@ -129,7 +128,7 @@ public class CreateRelatedEntitiesJob_phase1 { relsByTarget .joinWith(entities, entities.col("_1").equalTo(relsByTarget.col("_1")), "inner") .map( - (MapFunction, Tuple2>, RelatedEntityWrapper>) t -> new RelatedEntityWrapper( + (MapFunction, Tuple2>, RelatedEntityWrapper>) t -> new RelatedEntityWrapper( t._1()._2(), t._2()._2()), Encoders.kryo(RelatedEntityWrapper.class)) .write() @@ -232,11 +231,11 @@ public class CreateRelatedEntitiesJob_phase1 { * @param relationPath * @return the Dataset containing all the relationships */ - private static Dataset readPathRelation( + private static Dataset readPathRelation( SparkSession spark, final String relationPath) { log.info("Reading relations from: {}", relationPath); - return spark.read().load(relationPath).as(Encoders.bean(SortableRelation.class)); + return spark.read().load(relationPath).as(Encoders.bean(Relation.class)); } private static void removeOutputDir(SparkSession spark, String path) { diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java index 5ef30d6e1..bfcc648a3 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java @@ -19,7 +19,6 @@ import org.apache.spark.sql.expressions.Aggregator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.Lists; @@ -28,8 +27,6 @@ import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.oa.provision.model.JoinedEntity; import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport; import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper; -import eu.dnetlib.dhp.oa.provision.model.TypedRow; -import eu.dnetlib.dhp.schema.common.EntityType; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; import scala.Tuple2; @@ -305,20 +302,6 @@ public class CreateRelatedEntitiesJob_phase2 { private static FilterFunction filterEmptyEntityFn() { return (FilterFunction) v -> Objects.nonNull(v.getEntity()); - /* - * return (FilterFunction) v -> Optional .ofNullable(v.getEntity()) .map(e -> - * StringUtils.isNotBlank(e.getId())) .orElse(false); - */ - } - - private static TypedRow getTypedRow(String type, OafEntity entity) - throws JsonProcessingException { - TypedRow t = new TypedRow(); - t.setType(type); - t.setDeleted(entity.getDataInfo().getDeletedbyinference()); - t.setId(entity.getId()); - t.setOaf(OBJECT_MAPPER.writeValueAsString(entity)); - return t; } private static void removeOutputDir(SparkSession spark, String path) { diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java index 6b184071a..601cf6449 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java @@ -4,34 +4,35 @@ package eu.dnetlib.dhp.oa.provision; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.util.*; -import java.util.function.Function; +import java.util.function.Supplier; +import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.*; +import org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.rdd.RDD; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.*; +import org.apache.spark.sql.expressions.Aggregator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.clearspring.analytics.util.Lists; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.base.Splitter; import com.google.common.collect.Iterables; -import com.google.common.collect.Iterators; +import com.google.common.collect.Maps; import com.google.common.collect.Sets; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.FunctionalInterfaceSupport; import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.oa.provision.model.SortableRelation; +import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport; +import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey; import eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner; -import scala.Function1; +import eu.dnetlib.dhp.schema.oaf.Relation; import scala.Tuple2; /** @@ -106,6 +107,8 @@ public class PrepareRelationsJob { log.info("maxRelations: {}", maxRelations); SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ProvisionModelSupport.getModelClasses()); runWithSparkSession( conf, @@ -129,31 +132,111 @@ public class PrepareRelationsJob { * @param maxRelations maximum number of allowed outgoing edges * @param relPartitions number of partitions for the output RDD */ - private static void prepareRelationsRDD( - SparkSession spark, String inputRelationsPath, String outputPath, Set relationFilter, int maxRelations, - int relPartitions) { + private static void prepareRelationsRDD(SparkSession spark, String inputRelationsPath, String outputPath, + Set relationFilter, int maxRelations, int relPartitions) { - RDD cappedRels = readPathRelationRDD(spark, inputRelationsPath) - .repartition(relPartitions) - .filter(rel -> !rel.getDataInfo().getDeletedbyinference()) - .filter(rel -> !relationFilter.contains(rel.getRelClass())) - // group by SOURCE and apply limit - .mapToPair(rel -> new Tuple2<>(rel.getSource(), rel)) - .groupByKey(new RelationPartitioner(relPartitions)) - .flatMap(group -> Iterables.limit(group._2(), maxRelations).iterator()) - // group by TARGET and apply limit - .mapToPair(rel -> new Tuple2<>(rel.getTarget(), rel)) - .groupByKey(new RelationPartitioner(relPartitions)) - .flatMap(group -> Iterables.limit(group._2(), maxRelations).iterator()) + // group by SOURCE and apply limit + RDD bySource = readPathRelationRDD(spark, inputRelationsPath) + .filter(rel -> rel.getDataInfo().getDeletedbyinference() == false) + .filter(rel -> relationFilter.contains(rel.getRelClass()) == false) + .mapToPair(r -> new Tuple2<>(SortableRelationKey.create(r, r.getSource()), r)) + .repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions)) + .groupBy(Tuple2::_1) + .map(Tuple2::_2) + .map(t -> Iterables.limit(t, maxRelations)) + .flatMap(Iterable::iterator) + .map(Tuple2::_2) .rdd(); spark - .createDataset(cappedRels, Encoders.bean(SortableRelation.class)) + .createDataset(bySource, Encoders.bean(Relation.class)) + .repartition(relPartitions) .write() .mode(SaveMode.Overwrite) .parquet(outputPath); } + private static void prepareRelationsDataset( + SparkSession spark, String inputRelationsPath, String outputPath, Set relationFilter, int maxRelations, + int relPartitions) { + spark + .read() + .textFile(inputRelationsPath) + .repartition(relPartitions) + .map( + (MapFunction) s -> OBJECT_MAPPER.readValue(s, Relation.class), + Encoders.kryo(Relation.class)) + .filter((FilterFunction) rel -> rel.getDataInfo().getDeletedbyinference() == false) + .filter((FilterFunction) rel -> relationFilter.contains(rel.getRelClass()) == false) + .groupByKey( + (MapFunction) Relation::getSource, + Encoders.STRING()) + .agg(new RelationAggregator(maxRelations).toColumn()) + .flatMap( + (FlatMapFunction, Relation>) t -> Iterables + .limit(t._2().getRelations(), maxRelations) + .iterator(), + Encoders.bean(Relation.class)) + .repartition(relPartitions) + .write() + .mode(SaveMode.Overwrite) + .parquet(outputPath); + } + + public static class RelationAggregator + extends Aggregator { + + private int maxRelations; + + public RelationAggregator(int maxRelations) { + this.maxRelations = maxRelations; + } + + @Override + public RelationList zero() { + return new RelationList(); + } + + @Override + public RelationList reduce(RelationList b, Relation a) { + b.getRelations().add(a); + return getSortableRelationList(b); + } + + @Override + public RelationList merge(RelationList b1, RelationList b2) { + b1.getRelations().addAll(b2.getRelations()); + return getSortableRelationList(b1); + } + + @Override + public RelationList finish(RelationList r) { + return getSortableRelationList(r); + } + + private RelationList getSortableRelationList(RelationList b1) { + RelationList sr = new RelationList(); + sr + .setRelations( + b1 + .getRelations() + .stream() + .limit(maxRelations) + .collect(Collectors.toCollection(() -> new PriorityQueue<>(new RelationComparator())))); + return sr; + } + + @Override + public Encoder bufferEncoder() { + return Encoders.kryo(RelationList.class); + } + + @Override + public Encoder outputEncoder() { + return Encoders.kryo(RelationList.class); + } + } + /** * Reads a JavaRDD of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text * file, @@ -162,10 +245,10 @@ public class PrepareRelationsJob { * @param inputPath * @return the JavaRDD containing all the relationships */ - private static JavaRDD readPathRelationRDD( + private static JavaRDD readPathRelationRDD( SparkSession spark, final String inputPath) { JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - return sc.textFile(inputPath).map(s -> OBJECT_MAPPER.readValue(s, SortableRelation.class)); + return sc.textFile(inputPath).map(s -> OBJECT_MAPPER.readValue(s, Relation.class)); } private static void removeOutputDir(SparkSession spark, String path) { diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/RelationComparator.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/RelationComparator.java new file mode 100644 index 000000000..f2209c26c --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/RelationComparator.java @@ -0,0 +1,43 @@ + +package eu.dnetlib.dhp.oa.provision; + +import java.util.Comparator; +import java.util.Map; +import java.util.Optional; + +import com.google.common.collect.ComparisonChain; +import com.google.common.collect.Maps; + +import eu.dnetlib.dhp.schema.oaf.Relation; + +public class RelationComparator implements Comparator { + + private static final Map weights = Maps.newHashMap(); + + static { + weights.put("outcome", 0); + weights.put("supplement", 1); + weights.put("review", 2); + weights.put("citation", 3); + weights.put("affiliation", 4); + weights.put("relationship", 5); + weights.put("publicationDataset", 6); + weights.put("similarity", 7); + + weights.put("provision", 8); + weights.put("participation", 9); + weights.put("dedup", 10); + } + + private Integer getWeight(Relation o) { + return Optional.ofNullable(weights.get(o.getSubRelType())).orElse(Integer.MAX_VALUE); + } + + @Override + public int compare(Relation o1, Relation o2) { + return ComparisonChain + .start() + .compare(getWeight(o1), getWeight(o2)) + .result(); + } +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/RelationList.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/RelationList.java new file mode 100644 index 000000000..6e5fd7dba --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/RelationList.java @@ -0,0 +1,25 @@ + +package eu.dnetlib.dhp.oa.provision; + +import java.io.Serializable; +import java.util.PriorityQueue; +import java.util.Queue; + +import eu.dnetlib.dhp.schema.oaf.Relation; + +public class RelationList implements Serializable { + + private Queue relations; + + public RelationList() { + this.relations = new PriorityQueue<>(new RelationComparator()); + } + + public Queue getRelations() { + return relations; + } + + public void setRelations(Queue relations) { + this.relations = relations; + } +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SortableRelation.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SortableRelation.java new file mode 100644 index 000000000..8ce92a6a0 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SortableRelation.java @@ -0,0 +1,80 @@ + +package eu.dnetlib.dhp.oa.provision; + +import java.io.Serializable; +import java.util.Map; +import java.util.Optional; + +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.google.common.collect.ComparisonChain; +import com.google.common.collect.Maps; + +import eu.dnetlib.dhp.schema.oaf.Relation; + +public class SortableRelation extends Relation implements Comparable, Serializable { + + private static final Map weights = Maps.newHashMap(); + + static { + weights.put("outcome", 0); + weights.put("supplement", 1); + weights.put("review", 2); + weights.put("citation", 3); + weights.put("affiliation", 4); + weights.put("relationship", 5); + weights.put("publicationDataset", 6); + weights.put("similarity", 7); + + weights.put("provision", 8); + weights.put("participation", 9); + weights.put("dedup", 10); + } + + private static final long serialVersionUID = 34753984579L; + + private String groupingKey; + + public static SortableRelation create(Relation r, String groupingKey) { + SortableRelation sr = new SortableRelation(); + sr.setGroupingKey(groupingKey); + sr.setSource(r.getSource()); + sr.setTarget(r.getTarget()); + sr.setRelType(r.getRelType()); + sr.setSubRelType(r.getSubRelType()); + sr.setRelClass(r.getRelClass()); + sr.setDataInfo(r.getDataInfo()); + sr.setCollectedfrom(r.getCollectedfrom()); + sr.setLastupdatetimestamp(r.getLastupdatetimestamp()); + sr.setProperties(r.getProperties()); + sr.setValidated(r.getValidated()); + sr.setValidationDate(r.getValidationDate()); + + return sr; + } + + @JsonIgnore + public Relation asRelation() { + return this; + } + + @Override + public int compareTo(SortableRelation o) { + return ComparisonChain + .start() + .compare(getGroupingKey(), o.getGroupingKey()) + .compare(getWeight(this), getWeight(o)) + .result(); + } + + private Integer getWeight(SortableRelation o) { + return Optional.ofNullable(weights.get(o.getSubRelType())).orElse(Integer.MAX_VALUE); + } + + public String getGroupingKey() { + return groupingKey; + } + + public void setGroupingKey(String groupingKey) { + this.groupingKey = groupingKey; + } +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java index f9fde14e5..c09ed86e5 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java @@ -5,6 +5,8 @@ import java.util.List; import com.google.common.collect.Lists; +import eu.dnetlib.dhp.oa.provision.RelationList; +import eu.dnetlib.dhp.oa.provision.SortableRelation; import eu.dnetlib.dhp.schema.common.ModelSupport; public class ProvisionModelSupport { @@ -15,11 +17,12 @@ public class ProvisionModelSupport { .addAll( Lists .newArrayList( - TypedRow.class, RelatedEntityWrapper.class, JoinedEntity.class, RelatedEntity.class, - SortableRelation.class)); + SortableRelationKey.class, + SortableRelation.class, + RelationList.class)); return modelClasses.toArray(new Class[] {}); } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntityWrapper.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntityWrapper.java index d708b6ed0..4a4a4a5be 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntityWrapper.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntityWrapper.java @@ -5,28 +5,26 @@ import java.io.Serializable; import com.google.common.base.Objects; +import eu.dnetlib.dhp.schema.oaf.Relation; + public class RelatedEntityWrapper implements Serializable { - private SortableRelation relation; + private Relation relation; private RelatedEntity target; public RelatedEntityWrapper() { } - public RelatedEntityWrapper(SortableRelation relation, RelatedEntity target) { - this(null, relation, target); - } - - public RelatedEntityWrapper(TypedRow entity, SortableRelation relation, RelatedEntity target) { + public RelatedEntityWrapper(Relation relation, RelatedEntity target) { this.relation = relation; this.target = target; } - public SortableRelation getRelation() { + public Relation getRelation() { return relation; } - public void setRelation(SortableRelation relation) { + public void setRelation(Relation relation) { this.relation = relation; } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java deleted file mode 100644 index b6571b9bf..000000000 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java +++ /dev/null @@ -1,38 +0,0 @@ - -package eu.dnetlib.dhp.oa.provision.model; - -import java.io.Serializable; -import java.util.Map; - -import com.google.common.collect.ComparisonChain; -import com.google.common.collect.Maps; - -import eu.dnetlib.dhp.schema.oaf.Relation; - -public class SortableRelation extends Relation implements Comparable, Serializable { - - private static final Map weights = Maps.newHashMap(); - - static { - weights.put("outcome", 0); - weights.put("supplement", 1); - weights.put("affiliation", 2); - weights.put("relationship", 3); - weights.put("publicationDataset", 4); - weights.put("similarity", 5); - - weights.put("provision", 6); - weights.put("participation", 7); - weights.put("dedup", 8); - } - - @Override - public int compareTo(Relation o) { - return ComparisonChain - .start() - .compare(weights.get(getSubRelType()), weights.get(o.getSubRelType())) - .compare(getSource(), o.getSource()) - .compare(getTarget(), o.getTarget()) - .result(); - } -} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java new file mode 100644 index 000000000..bf7f9330d --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java @@ -0,0 +1,90 @@ + +package eu.dnetlib.dhp.oa.provision.model; + +import java.io.Serializable; +import java.util.Map; +import java.util.Optional; + +import com.google.common.base.Objects; +import com.google.common.collect.ComparisonChain; +import com.google.common.collect.Maps; + +import eu.dnetlib.dhp.schema.oaf.Relation; + +public class SortableRelationKey implements Comparable, Serializable { + + private static final Map weights = Maps.newHashMap(); + + static { + weights.put("outcome", 0); + weights.put("supplement", 1); + weights.put("review", 2); + weights.put("citation", 3); + weights.put("affiliation", 4); + weights.put("relationship", 5); + weights.put("publicationDataset", 6); + weights.put("similarity", 7); + + weights.put("provision", 8); + weights.put("participation", 9); + weights.put("dedup", 10); + } + + private static final long serialVersionUID = 3232323; + + private String groupingKey; + + private String subRelType; + + public static SortableRelationKey create(Relation r, String groupingKey) { + SortableRelationKey sr = new SortableRelationKey(); + sr.setGroupingKey(groupingKey); + sr.setSubRelType(r.getSubRelType()); + return sr; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + SortableRelationKey that = (SortableRelationKey) o; + return getGroupingKey().equals(that.getGroupingKey()); + } + + @Override + public int hashCode() { + return Objects.hashCode(getGroupingKey()); + } + + @Override + public int compareTo(SortableRelationKey o) { + return ComparisonChain + .start() + .compare(getGroupingKey(), o.getGroupingKey()) + .compare(getWeight(this), getWeight(o)) + .result(); + } + + private Integer getWeight(SortableRelationKey o) { + return Optional.ofNullable(weights.get(o.getSubRelType())).orElse(Integer.MAX_VALUE); + } + + public String getSubRelType() { + return subRelType; + } + + public void setSubRelType(String subRelType) { + this.subRelType = subRelType; + } + + public String getGroupingKey() { + return groupingKey; + } + + public void setGroupingKey(String groupingKey) { + this.groupingKey = groupingKey; + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/TypedRow.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/TypedRow.java deleted file mode 100644 index cbec372e4..000000000 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/TypedRow.java +++ /dev/null @@ -1,64 +0,0 @@ - -package eu.dnetlib.dhp.oa.provision.model; - -import java.io.Serializable; - -import com.google.common.base.Objects; - -public class TypedRow implements Serializable { - - private String id; - - private Boolean deleted; - - private String type; - - private String oaf; - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - public Boolean getDeleted() { - return deleted; - } - - public void setDeleted(Boolean deleted) { - this.deleted = deleted; - } - - public String getType() { - return type; - } - - public void setType(String type) { - this.type = type; - } - - public String getOaf() { - return oaf; - } - - public void setOaf(String oaf) { - this.oaf = oaf; - } - - @Override - public boolean equals(Object o) { - if (this == o) - return true; - if (o == null || getClass() != o.getClass()) - return false; - TypedRow typedRow2 = (TypedRow) o; - return Objects.equal(id, typedRow2.id); - } - - @Override - public int hashCode() { - return Objects.hashCode(id); - } -} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/RelationPartitioner.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/RelationPartitioner.java index c7862b48a..7bd8b9217 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/RelationPartitioner.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/RelationPartitioner.java @@ -4,12 +4,16 @@ package eu.dnetlib.dhp.oa.provision.utils; import org.apache.spark.Partitioner; import org.apache.spark.util.Utils; +import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey; + /** * Used in combination with SortableRelationKey, allows to partition the records by source id, therefore allowing to * sort relations sharing the same source id by the ordering defined in SortableRelationKey. */ public class RelationPartitioner extends Partitioner { + private static final long serialVersionUID = 343434456L; + private final int numPartitions; public RelationPartitioner(int numPartitions) { @@ -23,8 +27,18 @@ public class RelationPartitioner extends Partitioner { @Override public int getPartition(Object key) { - String partitionKey = (String) key; - return Utils.nonNegativeMod(partitionKey.hashCode(), numPartitions()); + SortableRelationKey partitionKey = (SortableRelationKey) key; + return Utils.nonNegativeMod(partitionKey.getGroupingKey().hashCode(), numPartitions()); + } + + @Override + public boolean equals(Object obj) { + if (obj instanceof RelationPartitioner) { + RelationPartitioner p = (RelationPartitioner) obj; + if (p.numPartitions() == numPartitions()) + return true; + } + return false; } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java index b2aa01dc7..db9a68d3d 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java @@ -121,10 +121,6 @@ public class XmlRecordFactory implements Serializable { } } - private static OafEntity toOafEntity(TypedRow typedRow) { - return parseOaf(typedRow.getOaf(), typedRow.getType()); - } - private static OafEntity parseOaf(final String json, final String type) { try { switch (EntityType.valueOf(type)) { @@ -276,7 +272,7 @@ public class XmlRecordFactory implements Serializable { pidType, pidValue .toLowerCase() - .replaceAll("orcid", ""))); + .replaceAll("^.*orcid\\.org\\/", ""))); } } }); diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml index 0d5121cf1..697a00a09 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml @@ -133,6 +133,7 @@ --inputRelationsPath${inputGraphRootPath}/relation --outputPath${workingDir}/relation + --maxRelations${maxRelations} --relPartitions5000 diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJobTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJobTest.java new file mode 100644 index 000000000..c16bbc6fb --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJobTest.java @@ -0,0 +1,93 @@ + +package eu.dnetlib.dhp.oa.provision; + +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport; +import eu.dnetlib.dhp.schema.oaf.Relation; +import org.apache.commons.io.FileUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +public class PrepareRelationsJobTest { + + private static final Logger log = LoggerFactory.getLogger(PrepareRelationsJobTest.class); + + public static final String SUBRELTYPE = "subRelType"; + public static final String OUTCOME = "outcome"; + public static final String SUPPLEMENT = "supplement"; + + private static SparkSession spark; + + private static Path workingDir; + + @BeforeAll + public static void setUp() throws IOException { + workingDir = Files.createTempDirectory(PrepareRelationsJobTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); + + SparkConf conf = new SparkConf(); + + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ProvisionModelSupport.getModelClasses()); + + spark = SparkSession + .builder() + .appName(PrepareRelationsJobTest.class.getSimpleName()) + .master("local[*]") + .config(conf) + .getOrCreate(); + } + + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } + + @Test + public void testRunPrepareRelationsJob(@TempDir Path testPath) throws Exception { + + final int maxRelations = 10; + PrepareRelationsJob + .main( + new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-inputRelationsPath", getClass().getResource("relations.gz").getPath(), + "-outputPath", testPath.toString(), + "-relPartitions", "10", + "-relationFilter", "asd", + "-maxRelations", String.valueOf(maxRelations) + }); + + Dataset out = spark.read() + .parquet(testPath.toString()) + .as(Encoders.bean(Relation.class)) + .cache(); + + Assertions.assertEquals(10, out.count()); + + Dataset freq = out.toDF().cube(SUBRELTYPE).count().filter((FilterFunction) value -> !value.isNullAt(0)); + long outcome = freq.filter(freq.col(SUBRELTYPE).equalTo(OUTCOME)).collectAsList().get(0).getAs("count"); + long supplement = freq.filter(freq.col(SUBRELTYPE).equalTo(SUPPLEMENT)).collectAsList().get(0).getAs("count"); + + Assertions.assertTrue(outcome > supplement); + Assertions.assertEquals(7, outcome); + Assertions.assertEquals(3, supplement); + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SortableRelationKeyTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SortableRelationKeyTest.java new file mode 100644 index 000000000..72f28fdf2 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SortableRelationKeyTest.java @@ -0,0 +1,42 @@ + +package eu.dnetlib.dhp.oa.provision; + +import java.io.IOException; +import java.util.List; + +import org.apache.commons.io.IOUtils; +import org.junit.jupiter.api.Test; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey; +import eu.dnetlib.dhp.schema.oaf.Relation; + +public class SortableRelationKeyTest { + + @Test + public void doTesSorting() throws IOException { + final ObjectMapper mapper = new ObjectMapper(); + final String json = IOUtils.toString(this.getClass().getResourceAsStream("relations.json")); + final List relations = mapper.readValue(json, new TypeReference>() { + }); + + relations + .stream() + .map(r -> SortableRelationKey.create(r, r.getSource())) + .sorted() + .forEach( + + it -> { + try { + System.out.println(mapper.writeValueAsString(it)); + } catch (JsonProcessingException e) { + e.printStackTrace(); + } + }); + + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/relations.gz b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/relations.gz new file mode 100644 index 000000000..13bc01c8c Binary files /dev/null and b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/relations.gz differ diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/relations.json b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/relations.json new file mode 100644 index 000000000..3280d0d61 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/relations.json @@ -0,0 +1,90 @@ +[ + { + "collectedfrom": [], + "dataInfo": { + "deletedbyinference": false, + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:entityregistry", + "classname": "Harvested", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "lastupdatetimestamp": 1592688952862, + "properties": [], + "relClass": "hasAuthorInstitution", + "relType": "resultOrganization", + "source": "1", + "subRelType": "affiliation", + "target": "2" + }, + { + "collectedfrom": [], + "dataInfo": { + "deletedbyinference": false, + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:entityregistry", + "classname": "Harvested", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "lastupdatetimestamp": 1592688952862, + "properties": [], + "relClass": "isAuthorInstitutionOf", + "relType": "resultOrganization", + "source": "2", + "subRelType": "affiliation", + "target": "1" + }, + { + "collectedfrom": [], + "dataInfo": { + "deletedbyinference": false, + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:entityregistry", + "classname": "Harvested", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "lastupdatetimestamp": 1592688952862, + "properties": [], + "relClass": "isProducedBy", + "relType": "resultProject", + "source": "1", + "subRelType": "outcome", + "target": "2" + }, + { + "collectedfrom": [], + "dataInfo": { + "deletedbyinference": false, + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:entityregistry", + "classname": "Harvested", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "lastupdatetimestamp": 1592688952862, + "properties": [], + "relClass": "produces", + "relType": "resultProject", + "source": "2", + "subRelType": "outcome", + "target": "1" + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/log4j.properties b/dhp-workflows/dhp-graph-provision/src/test/resources/log4j.properties new file mode 100644 index 000000000..20f56e38d --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/log4j.properties @@ -0,0 +1,11 @@ +# Set root logger level to DEBUG and its only appender to A1. +log4j.rootLogger=INFO, A1 + +# A1 is set to be a ConsoleAppender. +log4j.appender.A1=org.apache.log4j.ConsoleAppender + +# A1 uses PatternLayout. +log4j.logger.org = ERROR +log4j.logger.eu.dnetlib = DEBUG +log4j.appender.A1.layout=org.apache.log4j.PatternLayout +log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n \ No newline at end of file