diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java index a3c1610db9..c5905e45b3 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java @@ -1,6 +1,10 @@ package eu.dnetlib.dhp.schema.common; +import java.security.Key; + +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.Qualifier; public class ModelConstants { @@ -95,6 +99,9 @@ public class ModelConstants { SYSIMPORT_CROSSWALK_ENTITYREGISTRY, SYSIMPORT_CROSSWALK_ENTITYREGISTRY, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS); + public static final KeyValue UNKNOWN_REPOSITORY = keyValue( + "10|openaire____::55045bd2a65019fd8e6741a755395c8c", "Unknown Repository"); + private static Qualifier qualifier( final String classid, final String classname, @@ -107,4 +114,12 @@ public class ModelConstants { q.setSchemename(schemename); return q; } + + private static KeyValue keyValue(String key, String value) { + KeyValue kv = new KeyValue(); + kv.setKey(key); + kv.setValue(value); + kv.setDataInfo(new DataInfo()); + return kv; + } } diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java index 17bfc4af36..5fa9e67235 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java @@ -4,6 +4,7 @@ package eu.dnetlib.dhp.actionmanager.promote; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass; +import java.io.IOException; import java.util.Objects; import java.util.Optional; import java.util.function.BiFunction; @@ -20,6 +21,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.exc.UnrecognizedPropertyException; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier; @@ -134,24 +136,39 @@ public class PromoteActionPayloadForGraphTableJob { .map( (MapFunction) value -> OBJECT_MAPPER.readValue(value, rowClazz), Encoders.bean(rowClazz)); - - /* - * return spark .read() .parquet(path) .as(Encoders.bean(rowClazz)); - */ } private static Dataset readActionPayload( SparkSession spark, String path, Class actionPayloadClazz) { logger.info("Reading action payload from path: {}", path); + return spark .read() .parquet(path) + .map((MapFunction) value -> extractPayload(value), Encoders.STRING()) .map( - (MapFunction) value -> OBJECT_MAPPER - .readValue(value. getAs("payload"), actionPayloadClazz), + (MapFunction) value -> decodePayload(actionPayloadClazz, value), Encoders.bean(actionPayloadClazz)); } + private static String extractPayload(Row value) { + try { + return value. getAs("payload"); + } catch (IllegalArgumentException | ClassCastException e) { + logger.error("cannot extract payload from action: {}", value.toString()); + throw e; + } + } + + private static A decodePayload(Class actionPayloadClazz, String payload) throws IOException { + try { + return OBJECT_MAPPER.readValue(payload, actionPayloadClazz); + } catch (UnrecognizedPropertyException e) { + logger.error("error decoding payload: {}", payload); + throw e; + } + } + private static Dataset promoteActionPayloadForGraphTable( Dataset rowDS, Dataset actionPayloadDS, diff --git a/dhp-workflows/dhp-broker-events/pom.xml b/dhp-workflows/dhp-broker-events/pom.xml index eddc042c6b..8d7d3b88c5 100644 --- a/dhp-workflows/dhp-broker-events/pom.xml +++ b/dhp-workflows/dhp-broker-events/pom.xml @@ -24,11 +24,7 @@ org.apache.spark spark-sql_2.11 - - org.apache.spark - spark-hive_2.11 - test - + eu.dnetlib.dhp @@ -45,10 +41,6 @@ dnet-pace-core - - com.jayway.jsonpath - json-path - dom4j dom4j @@ -61,7 +53,7 @@ eu.dnetlib dnet-openaire-broker-common - [3.0.1,4.0.0) + [3.0.3,4.0.0) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java index bf4f62d243..6e38f74489 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java @@ -11,7 +11,7 @@ import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.time.DateUtils; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; public class EventFactory { @@ -49,8 +49,8 @@ public class EventFactory { private static Map createMapFromResult(final UpdateInfo updateInfo) { final Map map = new HashMap<>(); - final OpenaireBrokerResult source = updateInfo.getSource(); - final OpenaireBrokerResult target = updateInfo.getTarget(); + final OaBrokerMainEntity source = updateInfo.getSource(); + final OaBrokerMainEntity target = updateInfo.getTarget(); map.put("target_datasource_id", target.getCollectedFromId()); map.put("target_datasource_name", target.getCollectedFromName()); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java index c1f12f43c5..db59920106 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java @@ -18,25 +18,20 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.broker.model.Event; import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.EventFinder; import eu.dnetlib.dhp.broker.oa.util.EventGroup; import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultAggregator; import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup; -import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.OpenaireBrokerResultAggregator; -import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedEntityFactory; -import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; -import eu.dnetlib.dhp.schema.oaf.Project; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.OaBrokerMainEntityAggregator; import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Result; -import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.pace.config.DedupConfig; @@ -46,8 +41,6 @@ public class GenerateEventsApplication { private static final Logger log = LoggerFactory.getLogger(GenerateEventsApplication.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils @@ -75,126 +68,108 @@ public class GenerateEventsApplication { log.info("dedupConfigProfileId: {}", dedupConfigProfileId); final SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(BrokerConstants.getModelClasses()); + // conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + // conf.registerKryoClasses(BrokerConstants.getModelClasses()); - final DedupConfig dedupConfig = loadDedupConfig(isLookupUrl, dedupConfigProfileId); + // TODO UNCOMMENT + // final DedupConfig dedupConfig = loadDedupConfig(isLookupUrl, dedupConfigProfileId); + final DedupConfig dedupConfig = null; runWithSparkSession(conf, isSparkSessionManaged, spark -> { - removeOutputDir(spark, eventsPath); + ClusterUtils.removeDir(spark, eventsPath); - spark - .emptyDataset(Encoders.kryo(Event.class)) - .union(generateEvents(spark, graphPath, Publication.class, dedupConfig)) - .union(generateEvents(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class, dedupConfig)) - .union(generateEvents(spark, graphPath, Software.class, dedupConfig)) - .union(generateEvents(spark, graphPath, OtherResearchProduct.class, dedupConfig)) + // TODO REMOVE THIS + + expandResultsWithRelations(spark, graphPath, Publication.class) .write() .mode(SaveMode.Overwrite) - .option("compression", "gzip") .json(eventsPath); + + // TODO UNCOMMENT THIS + // spark + // .emptyDataset(Encoders.bean(Event.class)) + // .union(generateEvents(spark, graphPath, Publication.class, dedupConfig)) + // .union(generateEvents(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class, dedupConfig)) + // .union(generateEvents(spark, graphPath, Software.class, dedupConfig)) + // .union(generateEvents(spark, graphPath, OtherResearchProduct.class, dedupConfig)) + // .write() + // .mode(SaveMode.Overwrite) + // .option("compression", "gzip") + // .json(eventsPath); }); } - private static void removeOutputDir(final SparkSession spark, final String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } - private static Dataset generateEvents( final SparkSession spark, final String graphPath, final Class sourceClass, final DedupConfig dedupConfig) { - final Dataset results = expandResultsWithRelations(spark, graphPath, sourceClass); + final Dataset results = expandResultsWithRelations(spark, graphPath, sourceClass); - final Dataset mergedRels = readPath(spark, graphPath + "/relation", Relation.class) + final Dataset mergedRels = ClusterUtils + .readPath(spark, graphPath + "/relation", Relation.class) .filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)); - final TypedColumn, ResultGroup> aggr = new ResultAggregator() + final TypedColumn, ResultGroup> aggr = new ResultAggregator() .toColumn(); return results - .joinWith(mergedRels, results.col("result.id").equalTo(mergedRels.col("source")), "inner") + .joinWith(mergedRels, results.col("openaireId").equalTo(mergedRels.col("source")), "inner") .groupByKey( - (MapFunction, String>) t -> t._2.getTarget(), Encoders.STRING()) + (MapFunction, String>) t -> t._2.getTarget(), Encoders.STRING()) .agg(aggr) - .map((MapFunction, ResultGroup>) t -> t._2, Encoders.kryo(ResultGroup.class)) - .filter(ResultGroup::isValid) + .map((MapFunction, ResultGroup>) t -> t._2, Encoders.bean(ResultGroup.class)) + .filter(rg -> rg.getData().size() > 1) .map( (MapFunction) g -> EventFinder.generateEvents(g, dedupConfig), - Encoders.kryo(EventGroup.class)) - .flatMap(group -> group.getData().iterator(), Encoders.kryo(Event.class)); + Encoders.bean(EventGroup.class)) + .flatMap(group -> group.getData().iterator(), Encoders.bean(Event.class)); } - private static Dataset expandResultsWithRelations( + private static Dataset expandResultsWithRelations( final SparkSession spark, final String graphPath, final Class sourceClass) { - final Dataset projects = readPath(spark, graphPath + "/project", Project.class); - final Dataset datasets = readPath( - spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class); - final Dataset softwares = readPath(spark, graphPath + "/software", Software.class); - final Dataset publications = readPath(spark, graphPath + "/publication", Publication.class); - final Dataset rels = readPath(spark, graphPath + "/relation", Relation.class) - .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)) - .cache(); + // final Dataset datasets = readPath( + // spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class); + // final Dataset softwares = readPath(spark, graphPath + "/software", Software.class); + // final Dataset publications = readPath(spark, graphPath + "/publication", Publication.class); - final Dataset r0 = readPath( - spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), Result.class) - .filter(r -> r.getDataInfo().getDeletedbyinference()) - .map(ConversionUtils::oafResultToBrokerResult, Encoders.kryo(OpenaireBrokerResult.class)); + final Dataset r0 = ClusterUtils + .readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass) + .filter(r -> r.getDataInfo().getDeletedbyinference()) + .map(ConversionUtils::oafResultToBrokerResult, Encoders.bean(OaBrokerMainEntity.class)); - final Dataset r1 = join(r0, rels, relatedEntities(projects, rels, RelatedProject.class)); - final Dataset r2 = join(r1, rels, relatedEntities(softwares, rels, RelatedProject.class)); - final Dataset r3 = join(r2, rels, relatedEntities(datasets, rels, RelatedProject.class)); - final Dataset r4 = join( - r3, rels, relatedEntities(publications, rels, RelatedProject.class)); - ; + // TODO UNCOMMENT THIS + // final Dataset r1 = join(r0, relatedProjects(spark, graphPath)); + // final Dataset r2 = join(r1, relatedDataset(spark, graphPath)); + // final Dataset r3 = join(r2, relatedPublications(spark, graphPath)); + // final Dataset r4 = join(r3, relatedSoftwares(spark, graphPath)); - return r4; + return r0; // TODO it should be r4 } - private static Dataset relatedEntities(final Dataset targets, - final Dataset rels, - final Class clazz) { - return rels - .joinWith(targets, targets.col("id").equalTo(rels.col("target")), "inner") - .map( - t -> RelatedEntityFactory.newRelatedEntity(t._1.getSource(), t._1.getRelType(), t._2, clazz), - Encoders.kryo(clazz)); - } - - private static Dataset join(final Dataset sources, - final Dataset rels, + private static Dataset join(final Dataset sources, final Dataset typedRels) { - final TypedColumn, OpenaireBrokerResult> aggr = new OpenaireBrokerResultAggregator() + final TypedColumn, OaBrokerMainEntity> aggr = new OaBrokerMainEntityAggregator() .toColumn(); - ; return sources - .joinWith(typedRels, sources.col("result.id").equalTo(rels.col("source")), "left_outer") + .joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer") .groupByKey( - (MapFunction, String>) t -> t._1.getOpenaireId(), Encoders.STRING()) + (MapFunction, String>) t -> t._1.getOpenaireId(), Encoders.STRING()) .agg(aggr) - .map(t -> t._2, Encoders.kryo(OpenaireBrokerResult.class)); - } + .map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class)); - public static Dataset readPath( - final SparkSession spark, - final String inputPath, - final Class clazz) { - return spark - .read() - .textFile(inputPath) - .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); } private static DedupConfig loadDedupConfig(final String isLookupUrl, final String profId) throws Exception { + final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl); final String conf = isLookUpService diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedDatasets.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedDatasets.java new file mode 100644 index 0000000000..4a10fbabf1 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedDatasets.java @@ -0,0 +1,73 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDataset; +import eu.dnetlib.dhp.schema.oaf.Relation; + +public class GenerateRelatedDatasets { + + private static final Logger log = LoggerFactory.getLogger(GenerateRelatedDatasets.class); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + GenerateRelatedDatasets.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_relations.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String graphPath = parser.get("graphPath"); + log.info("graphPath: {}", graphPath); + + final String relsPath = parser.get("relsPath"); + log.info("relsPath: {}", relsPath); + + final SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, relsPath); + + final Dataset datasets = ClusterUtils + .readPath(spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class); + + final Dataset rels = ClusterUtils.readPath(spark, graphPath + "/relation", Relation.class); + + rels + .joinWith(datasets, datasets.col("id").equalTo(rels.col("target")), "inner") + .map( + t -> new RelatedDataset( + t._1.getSource(), + t._1.getRelType(), + ConversionUtils.oafDatasetToBrokerDataset(t._2)), + Encoders.bean(RelatedDataset.class)) + .write() + .mode(SaveMode.Overwrite) + .json(relsPath); + + }); + + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedProjects.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedProjects.java new file mode 100644 index 0000000000..59ed388e7c --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedProjects.java @@ -0,0 +1,79 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.Project; +import eu.dnetlib.dhp.schema.oaf.Relation; + +public class GenerateRelatedProjects { + + private static final Logger log = LoggerFactory.getLogger(GenerateRelatedProjects.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + GenerateRelatedProjects.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_relations.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String graphPath = parser.get("graphPath"); + log.info("graphPath: {}", graphPath); + + final String relsPath = parser.get("relsPath"); + log.info("relsPath: {}", relsPath); + + final SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, relsPath); + + final Dataset projects = ClusterUtils.readPath(spark, graphPath + "/project", Project.class); + + final Dataset rels = ClusterUtils + .readPath(spark, graphPath + "/relation", Relation.class) + .filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT)); + + rels + .joinWith(projects, projects.col("id").equalTo(rels.col("target")), "inner") + .map( + t -> new RelatedProject( + t._1.getSource(), + t._1.getRelType(), + ConversionUtils.oafProjectToBrokerProject(t._2)), + Encoders.bean(RelatedProject.class)) + .write() + .mode(SaveMode.Overwrite) + .json(relsPath); + }); + + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedPublications.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedPublications.java new file mode 100644 index 0000000000..0c20081dc8 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedPublications.java @@ -0,0 +1,78 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedPublication; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Relation; + +public class GenerateRelatedPublications { + + private static final Logger log = LoggerFactory.getLogger(GenerateRelatedPublications.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + GenerateRelatedPublications.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_relations.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String graphPath = parser.get("graphPath"); + log.info("graphPath: {}", graphPath); + + final String relsPath = parser.get("relsPath"); + log.info("relsPath: {}", relsPath); + + final SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, relsPath); + + final Dataset pubs = ClusterUtils + .readPath(spark, graphPath + "/publication", Publication.class); + + final Dataset rels = ClusterUtils.readPath(spark, graphPath + "/relation", Relation.class); + + rels + .joinWith(pubs, pubs.col("id").equalTo(rels.col("target")), "inner") + .map( + t -> new RelatedPublication( + t._1.getSource(), + t._1.getRelType(), + ConversionUtils.oafPublicationToBrokerPublication(t._2)), + Encoders.bean(RelatedPublication.class)) + .write() + .mode(SaveMode.Overwrite) + .json(relsPath); + + }); + + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedSoftwares.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedSoftwares.java new file mode 100644 index 0000000000..b957888463 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateRelatedSoftwares.java @@ -0,0 +1,76 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedSoftware; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Software; + +public class GenerateRelatedSoftwares { + + private static final Logger log = LoggerFactory.getLogger(GenerateRelatedSoftwares.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + GenerateRelatedSoftwares.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_relations.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String graphPath = parser.get("graphPath"); + log.info("graphPath: {}", graphPath); + + final String relsPath = parser.get("relsPath"); + log.info("relsPath: {}", relsPath); + + final SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, relsPath); + final Dataset softwares = ClusterUtils.readPath(spark, graphPath + "/software", Software.class); + + final Dataset rels = ClusterUtils.readPath(spark, graphPath + "/relation", Relation.class); + + rels + .joinWith(softwares, softwares.col("id").equalTo(rels.col("target")), "inner") + .map( + t -> new RelatedSoftware( + t._1.getSource(), + t._1.getRelType(), + ConversionUtils.oafSoftwareToBrokerSoftware(t._2)), + Encoders.bean(RelatedSoftware.class)) + .write() + .mode(SaveMode.Overwrite) + .json(relsPath); + + }); + + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateSimpleEntitities.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateSimpleEntitities.java new file mode 100644 index 0000000000..59485d5cf0 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateSimpleEntitities.java @@ -0,0 +1,85 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class GenerateSimpleEntitities { + + private static final Logger log = LoggerFactory.getLogger(GenerateSimpleEntitities.class); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + GenerateSimpleEntitities.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_simple_entities.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String graphPath = parser.get("graphPath"); + log.info("graphPath: {}", graphPath); + + final String simpleEntitiesPath = parser.get("simpleEntitiesPath"); + log.info("simpleEntitiesPath: {}", simpleEntitiesPath); + + final SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, simpleEntitiesPath); + + expandResultsWithRelations(spark, graphPath, Publication.class) + .write() + .mode(SaveMode.Overwrite) + .json(simpleEntitiesPath); + + // TODO UNCOMMENT THIS + // spark + // .emptyDataset(Encoders.bean(Event.class)) + // .union(generateEvents(spark, graphPath, Publication.class, dedupConfig)) + // .union(generateEvents(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class, dedupConfig)) + // .union(generateEvents(spark, graphPath, Software.class, dedupConfig)) + // .union(generateEvents(spark, graphPath, OtherResearchProduct.class, dedupConfig)) + // .write() + // .mode(SaveMode.Overwrite) + // .option("compression", "gzip") + // .json(eventsPath); + }); + + } + + private static Dataset expandResultsWithRelations( + final SparkSession spark, + final String graphPath, + final Class sourceClass) { + + return ClusterUtils + .readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass) + .filter(r -> r.getDataInfo().getDeletedbyinference()) + .map(ConversionUtils::oafResultToBrokerResult, Encoders.bean(OaBrokerMainEntity.class)); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java index 13aeefb2fc..9aa6f53847 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java @@ -12,7 +12,7 @@ import java.util.function.Function; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.pace.config.DedupConfig; @@ -21,11 +21,11 @@ public abstract class UpdateMatcher { private final boolean multipleUpdate; private final Function topicFunction; - private final BiConsumer compileHighlightFunction; + private final BiConsumer compileHighlightFunction; private final Function highlightToStringFunction; public UpdateMatcher(final boolean multipleUpdate, final Function topicFunction, - final BiConsumer compileHighlightFunction, + final BiConsumer compileHighlightFunction, final Function highlightToStringFunction) { this.multipleUpdate = multipleUpdate; this.topicFunction = topicFunction; @@ -33,13 +33,13 @@ public abstract class UpdateMatcher { this.highlightToStringFunction = highlightToStringFunction; } - public Collection> searchUpdatesForRecord(final OpenaireBrokerResult res, - final Collection others, + public Collection> searchUpdatesForRecord(final OaBrokerMainEntity res, + final Collection others, final DedupConfig dedupConfig) { final Map> infoMap = new HashMap<>(); - for (final OpenaireBrokerResult source : others) { + for (final OaBrokerMainEntity source : others) { if (source != res) { for (final T hl : findDifferences(source, res)) { final Topic topic = getTopicFunction().apply(hl); @@ -68,7 +68,7 @@ public abstract class UpdateMatcher { } } - protected abstract List findDifferences(OpenaireBrokerResult source, OpenaireBrokerResult target); + protected abstract List findDifferences(OaBrokerMainEntity source, OaBrokerMainEntity target); protected static boolean isMissing(final List list) { return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0)); @@ -86,7 +86,7 @@ public abstract class UpdateMatcher { return topicFunction; } - public BiConsumer getCompileHighlightFunction() { + public BiConsumer getCompileHighlightFunction() { return compileHighlightFunction; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java index 7a58f986ba..c197734a3b 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java @@ -5,13 +5,12 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import eu.dnetlib.broker.objects.Dataset; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerRelatedDataset; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public abstract class AbstractEnrichMissingDataset - extends UpdateMatcher { +public abstract class AbstractEnrichMissingDataset extends UpdateMatcher { public AbstractEnrichMissingDataset(final Topic topic) { super(true, @@ -23,14 +22,14 @@ public abstract class AbstractEnrichMissingDataset protected abstract boolean filterByType(String relType); @Override - protected final List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected final List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final Set existingDatasets = target .getDatasets() .stream() .filter(rel -> filterByType(rel.getRelType())) - .map(Dataset::getOriginalId) + .map(OaBrokerRelatedDataset::getOriginalId) .collect(Collectors.toSet()); return source diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java index fa5fde7251..49c546bbaf 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java @@ -4,12 +4,12 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedProjects; import java.util.ArrayList; import java.util.List; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.Project; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerProject; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public class EnrichMissingProject extends UpdateMatcher { +public class EnrichMissingProject extends UpdateMatcher { public EnrichMissingProject() { super(true, @@ -19,7 +19,7 @@ public class EnrichMissingProject extends UpdateMatcher { } @Override - protected List findDifferences(final OpenaireBrokerResult source, final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, final OaBrokerMainEntity target) { if (target.getProjects().isEmpty()) { return source.getProjects(); } else { diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java index ca63aeb49f..6954a3fb5e 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java @@ -5,12 +5,12 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.Project; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerProject; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public class EnrichMoreProject extends UpdateMatcher { +public class EnrichMoreProject extends UpdateMatcher { public EnrichMoreProject() { super(true, @@ -19,13 +19,13 @@ public class EnrichMoreProject extends UpdateMatcher { prj -> projectAsString(prj)); } - private static String projectAsString(final Project prj) { + private static String projectAsString(final OaBrokerProject prj) { return prj.getFunder() + "::" + prj.getFundingProgram() + "::" + prj.getCode(); } @Override - protected List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final Set existingProjects = target .getProjects() diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java index 3008639493..ad6d8263b8 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java @@ -5,12 +5,12 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.Publication; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerRelatedPublication; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public abstract class AbstractEnrichMissingPublication extends UpdateMatcher { +public abstract class AbstractEnrichMissingPublication extends UpdateMatcher { public AbstractEnrichMissingPublication(final Topic topic) { super(true, @@ -23,15 +23,15 @@ public abstract class AbstractEnrichMissingPublication extends UpdateMatcher findDifferences( - final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected final List findDifferences( + final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final Set existingPublications = target .getPublications() .stream() .filter(rel -> filterByType(rel.getRelType())) - .map(Publication::getOriginalId) + .map(OaBrokerRelatedPublication::getOriginalId) .collect(Collectors.toSet()); return source diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java index 76ae061e67..452caa5033 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java @@ -4,12 +4,13 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware; import java.util.ArrayList; import java.util.List; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; public class EnrichMissingSoftware - extends UpdateMatcher { + extends UpdateMatcher { public EnrichMissingSoftware() { super(true, @@ -19,9 +20,9 @@ public class EnrichMissingSoftware } @Override - protected List findDifferences( - final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences( + final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { if (target.getSoftwares().isEmpty()) { return source.getSoftwares(); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java index ebd421b8e7..aaffe12498 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java @@ -5,12 +5,12 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.Software; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public class EnrichMoreSoftware extends UpdateMatcher { +public class EnrichMoreSoftware extends UpdateMatcher { public EnrichMoreSoftware() { super(true, @@ -20,14 +20,14 @@ public class EnrichMoreSoftware extends UpdateMatcher { } @Override - protected List findDifferences( - final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences( + final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final Set existingSoftwares = source .getSoftwares() .stream() - .map(Software::getName) + .map(OaBrokerRelatedSoftware::getName) .collect(Collectors.toSet()); return target diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java index b2cbbce2c5..73462bae8f 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java @@ -5,7 +5,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; @@ -19,7 +19,7 @@ public class EnrichMissingAbstract extends UpdateMatcher { } @Override - protected List findDifferences(final OpenaireBrokerResult source, final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, final OaBrokerMainEntity target) { if (isMissing(target.getAbstracts()) && !isMissing(source.getAbstracts())) { return Arrays.asList(source.getAbstracts().get(0)); } else { diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java index c4b96e67bf..2a01188a9d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java @@ -7,12 +7,12 @@ import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; -import eu.dnetlib.broker.objects.Author; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerAuthor; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public class EnrichMissingAuthorOrcid extends UpdateMatcher { +public class EnrichMissingAuthorOrcid extends UpdateMatcher { public EnrichMissingAuthorOrcid() { super(true, @@ -22,13 +22,13 @@ public class EnrichMissingAuthorOrcid extends UpdateMatcher { } @Override - protected List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final Set existingOrcids = target .getCreators() .stream() - .map(Author::getOrcid) + .map(OaBrokerAuthor::getOrcid) .filter(StringUtils::isNotBlank) .collect(Collectors.toSet()); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java index e870cf1faa..4873829570 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java @@ -5,28 +5,28 @@ import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; -import eu.dnetlib.broker.objects.Instance; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerInstance; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; -public class EnrichMissingOpenAccess extends UpdateMatcher { +public class EnrichMissingOpenAccess extends UpdateMatcher { public EnrichMissingOpenAccess() { super(true, i -> Topic.ENRICH_MISSING_OA_VERSION, (p, i) -> p.getInstances().add(i), - Instance::getUrl); + OaBrokerInstance::getUrl); } @Override - protected List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final long count = target .getInstances() .stream() - .map(Instance::getLicense) + .map(OaBrokerInstance::getLicense) .filter(right -> right.equals(BrokerConstants.OPEN_ACCESS)) .count(); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java index cc72d9fa9a..ee1617b1e0 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java @@ -5,12 +5,12 @@ import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.TypedValue; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerTypedValue; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public class EnrichMissingPid extends UpdateMatcher { +public class EnrichMissingPid extends UpdateMatcher { public EnrichMissingPid() { super(true, @@ -20,8 +20,8 @@ public class EnrichMissingPid extends UpdateMatcher { } @Override - protected List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final long count = target.getPids().size(); if (count > 0) { diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java index ed8c26b5ab..2c0533fa35 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java @@ -5,7 +5,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; @@ -19,8 +19,8 @@ public class EnrichMissingPublicationDate extends UpdateMatcher { } @Override - protected List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { if (isMissing(target.getPublicationdate()) && !isMissing(source.getPublicationdate())) { return Arrays.asList(source.getPublicationdate()); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java index 07b1fa41a9..9ab9fce488 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java @@ -5,12 +5,12 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.TypedValue; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerTypedValue; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public class EnrichMissingSubject extends UpdateMatcher { +public class EnrichMissingSubject extends UpdateMatcher { public EnrichMissingSubject() { super(true, @@ -20,8 +20,8 @@ public class EnrichMissingSubject extends UpdateMatcher { } @Override - protected List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final Set existingSubject = target .getSubjects() .stream() @@ -35,7 +35,7 @@ public class EnrichMissingSubject extends UpdateMatcher { .collect(Collectors.toList()); } - private static String subjectAsString(final TypedValue s) { + private static String subjectAsString(final OaBrokerTypedValue s) { return s.getType() + "::" + s.getValue(); } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java index bfef3ee4f6..e90a8f2012 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java @@ -5,24 +5,24 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import eu.dnetlib.broker.objects.Instance; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerInstance; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; -public class EnrichMoreOpenAccess extends UpdateMatcher { +public class EnrichMoreOpenAccess extends UpdateMatcher { public EnrichMoreOpenAccess() { super(true, i -> Topic.ENRICH_MORE_OA_VERSION, (p, i) -> p.getInstances().add(i), - Instance::getUrl); + OaBrokerInstance::getUrl); } @Override - protected List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final Set urls = target .getInstances() .stream() diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java index d1f2e60225..43b4f0628d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java @@ -5,12 +5,12 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.TypedValue; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerTypedValue; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public class EnrichMorePid extends UpdateMatcher { +public class EnrichMorePid extends UpdateMatcher { public EnrichMorePid() { super(true, @@ -20,8 +20,8 @@ public class EnrichMorePid extends UpdateMatcher { } @Override - protected List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final Set existingPids = target .getPids() .stream() @@ -35,7 +35,7 @@ public class EnrichMorePid extends UpdateMatcher { .collect(Collectors.toList()); } - private static String pidAsString(final TypedValue pid) { + private static String pidAsString(final OaBrokerTypedValue pid) { return pid.getType() + "::" + pid.getValue(); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java index 39225e8ab8..04fb494ef9 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java @@ -5,12 +5,12 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.TypedValue; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerTypedValue; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public class EnrichMoreSubject extends UpdateMatcher { +public class EnrichMoreSubject extends UpdateMatcher { public EnrichMoreSubject() { super(true, @@ -20,8 +20,8 @@ public class EnrichMoreSubject extends UpdateMatcher { } @Override - protected List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final Set existingSubjects = target .getSubjects() .stream() @@ -35,7 +35,7 @@ public class EnrichMoreSubject extends UpdateMatcher { .collect(Collectors.toList()); } - private static String subjectAsString(final TypedValue s) { + private static String subjectAsString(final OaBrokerTypedValue s) { return s.getType() + "::" + s.getValue(); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java new file mode 100644 index 0000000000..8bcea5e6e3 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java @@ -0,0 +1,31 @@ + +package eu.dnetlib.dhp.broker.oa.util; + +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.common.HdfsSupport; + +public class ClusterUtils { + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void removeDir(final SparkSession spark, final String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } + + public static Dataset readPath( + final SparkSession spark, + final String inputPath, + final Class clazz) { + return spark + .read() + .textFile(inputPath) + .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java index d04ef45a0e..b61d5e7cc7 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java @@ -3,18 +3,28 @@ package eu.dnetlib.dhp.broker.oa.util; import java.util.ArrayList; import java.util.List; +import java.util.Objects; import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; -import org.apache.commons.lang3.tuple.Pair; import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.DocumentHelper; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.TypedValue; +import com.google.common.base.Function; + +import eu.dnetlib.broker.objects.OaBrokerAuthor; +import eu.dnetlib.broker.objects.OaBrokerExternalReference; +import eu.dnetlib.broker.objects.OaBrokerInstance; +import eu.dnetlib.broker.objects.OaBrokerJournal; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerProject; +import eu.dnetlib.broker.objects.OaBrokerRelatedDataset; +import eu.dnetlib.broker.objects.OaBrokerRelatedPublication; +import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware; +import eu.dnetlib.broker.objects.OaBrokerTypedValue; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.Dataset; import eu.dnetlib.dhp.schema.oaf.ExternalReference; @@ -24,6 +34,7 @@ import eu.dnetlib.dhp.schema.oaf.Journal; import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; @@ -32,145 +43,148 @@ public class ConversionUtils { private static final Logger log = LoggerFactory.getLogger(ConversionUtils.class); - public static List oafInstanceToBrokerInstances(final Instance i) { - return i.getUrl().stream().map(url -> { - return new eu.dnetlib.broker.objects.Instance() - .setUrl(url) - .setInstancetype(i.getInstancetype().getClassid()) - .setLicense(BrokerConstants.OPEN_ACCESS) - .setHostedby(i.getHostedby().getValue()); - }).collect(Collectors.toList()); + public static List oafInstanceToBrokerInstances(final Instance i) { + if (i == null) { + return new ArrayList<>(); + } + + return mappedList(i.getUrl(), url -> { + final OaBrokerInstance res = new OaBrokerInstance(); + res.setUrl(url); + res.setInstancetype(classId(i.getInstancetype())); + res.setLicense(BrokerConstants.OPEN_ACCESS); + res.setHostedby(kvValue(i.getHostedby())); + return res; + }); } - public static TypedValue oafPidToBrokerPid(final StructuredProperty sp) { - return sp != null ? new TypedValue() - .setValue(sp.getValue()) - .setType(sp.getQualifier().getClassid()) : null; + public static OaBrokerTypedValue oafPidToBrokerPid(final StructuredProperty sp) { + return oafStructPropToBrokerTypedValue(sp); } - public static final Pair oafSubjectToPair(final StructuredProperty sp) { - return sp != null ? Pair.of(sp.getQualifier().getClassid(), sp.getValue()) : null; + public static OaBrokerTypedValue oafStructPropToBrokerTypedValue(final StructuredProperty sp) { + return sp != null ? new OaBrokerTypedValue(classId(sp.getQualifier()), sp.getValue()) : null; } - public static final eu.dnetlib.broker.objects.Dataset oafDatasetToBrokerDataset(final Dataset d) { - return d != null ? new eu.dnetlib.broker.objects.Dataset() - .setOriginalId(d.getOriginalId().get(0)) - .setTitle(structPropValue(d.getTitle())) - .setPids(d.getPid().stream().map(ConversionUtils::oafPidToBrokerPid).collect(Collectors.toList())) - .setInstances( - d - .getInstance() - .stream() - .map(ConversionUtils::oafInstanceToBrokerInstances) - .flatMap(List::stream) - .collect(Collectors.toList())) - .setCollectedFrom(d.getCollectedfrom().stream().map(KeyValue::getValue).findFirst().orElse(null)) - : null; + public static final OaBrokerRelatedDataset oafDatasetToBrokerDataset(final Dataset d) { + if (d == null) { + return null; + } + + final OaBrokerRelatedDataset res = new OaBrokerRelatedDataset(); + res.setOpenaireId(d.getId()); + res.setOriginalId(first(d.getOriginalId())); + res.setTitle(structPropValue(d.getTitle())); + res.setPids(mappedList(d.getPid(), ConversionUtils::oafPidToBrokerPid)); + res.setInstances(flatMappedList(d.getInstance(), ConversionUtils::oafInstanceToBrokerInstances)); + res.setCollectedFrom(mappedFirst(d.getCollectedfrom(), KeyValue::getValue)); + return res; } - public static eu.dnetlib.broker.objects.Publication oafPublicationToBrokerPublication(final Publication p) { - return p != null ? new eu.dnetlib.broker.objects.Publication() - .setOriginalId(p.getOriginalId().get(0)) - .setTitle(structPropValue(p.getTitle())) - .setPids(p.getPid().stream().map(ConversionUtils::oafPidToBrokerPid).collect(Collectors.toList())) - .setInstances( - p - .getInstance() - .stream() - .map(ConversionUtils::oafInstanceToBrokerInstances) - .flatMap(List::stream) - .collect(Collectors.toList())) - .setCollectedFrom(p.getCollectedfrom().stream().map(KeyValue::getValue).findFirst().orElse(null)) - : null; - } - - public static final OpenaireBrokerResult oafResultToBrokerResult(final Result result) { - - return result != null ? new OpenaireBrokerResult() - .setOpenaireId(result.getId()) - .setOriginalId(result.getOriginalId().get(0)) - .setTypology(result.getResulttype().getClassid()) - .setTitles(structPropList(result.getTitle())) - .setAbstracts(fieldList(result.getDescription())) - .setLanguage(result.getLanguage().getClassid()) - .setSubjects(structPropTypedList(result.getSubject())) - .setCreators( - result.getAuthor().stream().map(ConversionUtils::oafAuthorToBrokerAuthor).collect(Collectors.toList())) - .setPublicationdate(result.getDateofacceptance().getValue()) - .setPublisher(fieldValue(result.getPublisher())) - .setEmbargoenddate(fieldValue(result.getEmbargoenddate())) - .setContributor(fieldList(result.getContributor())) - .setJournal( - result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null) - .setCollectedFromId(result.getCollectedfrom().stream().map(KeyValue::getKey).findFirst().orElse(null)) - .setCollectedFromName(result.getCollectedfrom().stream().map(KeyValue::getValue).findFirst().orElse(null)) - .setPids(result.getPid().stream().map(ConversionUtils::oafPidToBrokerPid).collect(Collectors.toList())) - .setInstances( - result - .getInstance() - .stream() - .map(ConversionUtils::oafInstanceToBrokerInstances) - .flatMap(List::stream) - .collect(Collectors.toList())) - .setExternalReferences( - result - .getExternalReference() - .stream() - .map(ConversionUtils::oafExtRefToBrokerExtRef) - .collect(Collectors.toList())) - : null; - } - - private static List structPropTypedList(final List list) { - return list - .stream() - .map( - p -> new TypedValue() - .setValue(p.getValue()) - .setType(p.getQualifier().getClassid())) - .collect(Collectors.toList()); - } - - private static eu.dnetlib.broker.objects.Author oafAuthorToBrokerAuthor(final Author author) { - return author != null ? new eu.dnetlib.broker.objects.Author() - .setFullname(author.getFullname()) - .setOrcid( - author - .getPid() - .stream() - .filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase("orcid")) - .map(pid -> pid.getValue()) - .findFirst() - .orElse(null)) - : null; - } - - private static eu.dnetlib.broker.objects.Journal oafJournalToBrokerJournal(final Journal journal) { - return journal != null ? new eu.dnetlib.broker.objects.Journal() - .setName(journal.getName()) - .setIssn(journal.getIssnPrinted()) - .setEissn(journal.getIssnOnline()) - .setLissn(journal.getIssnLinking()) : null; - } - - private static eu.dnetlib.broker.objects.ExternalReference oafExtRefToBrokerExtRef(final ExternalReference ref) { - return ref != null ? new eu.dnetlib.broker.objects.ExternalReference() - .setRefidentifier(ref.getRefidentifier()) - .setSitename(ref.getSitename()) - .setType(ref.getQualifier().getClassid()) - .setUrl(ref.getUrl()) - : null; - } - - public static final eu.dnetlib.broker.objects.Project oafProjectToBrokerProject(final Project p) { + public static OaBrokerRelatedPublication oafPublicationToBrokerPublication(final Publication p) { if (p == null) { return null; } - final eu.dnetlib.broker.objects.Project res = new eu.dnetlib.broker.objects.Project() - .setTitle(fieldValue(p.getTitle())) - .setAcronym(fieldValue(p.getAcronym())) - .setCode(fieldValue(p.getCode())); + final OaBrokerRelatedPublication res = new OaBrokerRelatedPublication(); + res.setOpenaireId(p.getId()); + res.setOriginalId(first(p.getOriginalId())); + res.setTitle(structPropValue(p.getTitle())); + res.setPids(mappedList(p.getPid(), ConversionUtils::oafPidToBrokerPid)); + res.setInstances(flatMappedList(p.getInstance(), ConversionUtils::oafInstanceToBrokerInstances)); + res.setCollectedFrom(mappedFirst(p.getCollectedfrom(), KeyValue::getValue)); + + return res; + } + + public static final OaBrokerMainEntity oafResultToBrokerResult(final Result result) { + if (result == null) { + return null; + } + + final OaBrokerMainEntity res = new OaBrokerMainEntity(); + + res.setOpenaireId(result.getId()); + res.setOriginalId(first(result.getOriginalId())); + res.setTypology(classId(result.getResulttype())); + res.setTitles(structPropList(result.getTitle())); + res.setAbstracts(fieldList(result.getDescription())); + res.setLanguage(classId(result.getLanguage())); + res.setSubjects(structPropTypedList(result.getSubject())); + res.setCreators(mappedList(result.getAuthor(), ConversionUtils::oafAuthorToBrokerAuthor)); + res.setPublicationdate(fieldValue(result.getDateofacceptance())); + res.setPublisher(fieldValue(result.getPublisher())); + res.setEmbargoenddate(fieldValue(result.getEmbargoenddate())); + res.setContributor(fieldList(result.getContributor())); + res + .setJournal( + result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null); + res.setCollectedFromId(mappedFirst(result.getCollectedfrom(), KeyValue::getKey)); + res.setCollectedFromName(mappedFirst(result.getCollectedfrom(), KeyValue::getValue)); + res.setPids(mappedList(result.getPid(), ConversionUtils::oafPidToBrokerPid)); + res.setInstances(flatMappedList(result.getInstance(), ConversionUtils::oafInstanceToBrokerInstances)); + res.setExternalReferences(mappedList(result.getExternalReference(), ConversionUtils::oafExtRefToBrokerExtRef)); + + return res; + } + + private static OaBrokerAuthor oafAuthorToBrokerAuthor(final Author author) { + if (author == null) { + return null; + } + + final String pids = author.getPid() != null ? author + .getPid() + .stream() + .filter(pid -> pid != null) + .filter(pid -> pid.getQualifier() != null) + .filter(pid -> pid.getQualifier().getClassid() != null) + .filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase("orcid")) + .map(pid -> pid.getValue()) + .filter(StringUtils::isNotBlank) + .findFirst() + .orElse(null) : null; + + return new OaBrokerAuthor(author.getFullname(), pids); + } + + private static OaBrokerJournal oafJournalToBrokerJournal(final Journal journal) { + if (journal == null) { + return null; + } + + final OaBrokerJournal res = new OaBrokerJournal(); + res.setName(journal.getName()); + res.setIssn(journal.getIssnPrinted()); + res.setEissn(journal.getIssnOnline()); + res.setLissn(journal.getIssnLinking()); + + return res; + } + + private static OaBrokerExternalReference oafExtRefToBrokerExtRef(final ExternalReference ref) { + if (ref == null) { + return null; + } + + final OaBrokerExternalReference res = new OaBrokerExternalReference(); + res.setRefidentifier(ref.getRefidentifier()); + res.setSitename(ref.getSitename()); + res.setType(classId(ref.getQualifier())); + res.setUrl(ref.getUrl()); + return res; + } + + public static final OaBrokerProject oafProjectToBrokerProject(final Project p) { + if (p == null) { + return null; + } + + final OaBrokerProject res = new OaBrokerProject(); + res.setOpenaireId(p.getId()); + res.setTitle(fieldValue(p.getTitle())); + res.setAcronym(fieldValue(p.getAcronym())); + res.setCode(fieldValue(p.getCode())); final String ftree = fieldValue(p.getFundingtree()); if (StringUtils.isNotBlank(ftree)) { @@ -187,13 +201,27 @@ public class ConversionUtils { return res; } - public static final eu.dnetlib.broker.objects.Software oafSoftwareToBrokerSoftware(final Software sw) { - return sw != null ? new eu.dnetlib.broker.objects.Software() - .setName(structPropValue(sw.getTitle())) - .setDescription(fieldValue(sw.getDescription())) - .setRepository(fieldValue(sw.getCodeRepositoryUrl())) - .setLandingPage(fieldValue(sw.getDocumentationUrl())) - : null; + public static final OaBrokerRelatedSoftware oafSoftwareToBrokerSoftware(final Software sw) { + if (sw == null) { + return null; + } + + final OaBrokerRelatedSoftware res = new OaBrokerRelatedSoftware(); + res.setOpenaireId(sw.getId()); + res.setName(structPropValue(sw.getTitle())); + res.setDescription(fieldValue(sw.getDescription())); + res.setRepository(fieldValue(sw.getCodeRepositoryUrl())); + res.setLandingPage(fieldValue(sw.getDocumentationUrl())); + + return res; + } + + private static String first(final List list) { + return list != null && list.size() > 0 ? list.get(0) : null; + } + + private static String kvValue(final KeyValue kv) { + return kv != null ? kv.getValue() : null; } private static String fieldValue(final Field f) { @@ -205,6 +233,10 @@ public class ConversionUtils { : null; } + private static String classId(final Qualifier q) { + return q != null ? q.getClassid() : null; + } + private static String structPropValue(final List props) { return props != null ? props.stream().map(StructuredProperty::getValue).filter(StringUtils::isNotBlank).findFirst().orElse(null) @@ -226,4 +258,55 @@ public class ConversionUtils { .collect(Collectors.toList()) : new ArrayList<>(); } + + private static List structPropTypedList(final List list) { + if (list == null) { + return new ArrayList<>(); + } + + return list + .stream() + .map(ConversionUtils::oafStructPropToBrokerTypedValue) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + } + + private static List mappedList(final List list, final Function func) { + if (list == null) { + return new ArrayList<>(); + } + + return list + .stream() + .map(func::apply) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + } + + private static List flatMappedList(final List list, final Function> func) { + if (list == null) { + return new ArrayList<>(); + } + + return list + .stream() + .map(func::apply) + .flatMap(List::stream) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + } + + private static T mappedFirst(final List list, final Function func) { + if (list == null) { + return null; + } + + return list + .stream() + .map(func::apply) + .filter(Objects::nonNull) + .findFirst() + .orElse(null); + } + } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java index 4c20ac5cae..7451e58910 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java @@ -4,7 +4,7 @@ package eu.dnetlib.dhp.broker.oa.util; import java.util.ArrayList; import java.util.List; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.model.EventFactory; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsReferencedBy; @@ -68,7 +68,7 @@ public class EventFinder { public static EventGroup generateEvents(final ResultGroup results, final DedupConfig dedupConfig) { final List> list = new ArrayList<>(); - for (final OpenaireBrokerResult target : results.getData()) { + for (final OaBrokerMainEntity target : results.getData()) { for (final UpdateMatcher matcher : matchers) { list.addAll(matcher.searchUpdatesForRecord(target, results.getData(), dedupConfig)); } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java index 82d0178648..25d0d2bcad 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java @@ -9,10 +9,10 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.broker.objects.Instance; -import eu.dnetlib.broker.objects.OpenAireEventPayload; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.Provenance; +import eu.dnetlib.broker.objects.OaBrokerEventPayload; +import eu.dnetlib.broker.objects.OaBrokerInstance; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerProvenance; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.model.MapDocument; @@ -25,11 +25,11 @@ public final class UpdateInfo { private final T highlightValue; - private final OpenaireBrokerResult source; + private final OaBrokerMainEntity source; - private final OpenaireBrokerResult target; + private final OaBrokerMainEntity target; - private final BiConsumer compileHighlight; + private final BiConsumer compileHighlight; private final Function highlightToString; @@ -37,9 +37,9 @@ public final class UpdateInfo { private static final Logger log = LoggerFactory.getLogger(UpdateInfo.class); - public UpdateInfo(final Topic topic, final T highlightValue, final OpenaireBrokerResult source, - final OpenaireBrokerResult target, - final BiConsumer compileHighlight, + public UpdateInfo(final Topic topic, final T highlightValue, final OaBrokerMainEntity source, + final OaBrokerMainEntity target, + final BiConsumer compileHighlight, final Function highlightToString, final DedupConfig dedupConfig) { this.topic = topic; @@ -55,16 +55,22 @@ public final class UpdateInfo { return highlightValue; } - public OpenaireBrokerResult getSource() { + public OaBrokerMainEntity getSource() { return source; } - public OpenaireBrokerResult getTarget() { + public OaBrokerMainEntity getTarget() { return target; } - private float calculateTrust(final DedupConfig dedupConfig, final OpenaireBrokerResult r1, - final OpenaireBrokerResult r2) { + private float calculateTrust(final DedupConfig dedupConfig, + final OaBrokerMainEntity r1, + final OaBrokerMainEntity r2) { + + if (dedupConfig == null) { + return BrokerConstants.MIN_TRUST; + } + try { final ObjectMapper objectMapper = new ObjectMapper(); final MapDocument doc1 = MapDocumentUtil @@ -98,11 +104,11 @@ public final class UpdateInfo { return highlightToString.apply(getHighlightValue()); } - public OpenAireEventPayload asBrokerPayload() { + public OaBrokerEventPayload asBrokerPayload() { compileHighlight.accept(target, getHighlightValue()); - final OpenaireBrokerResult hl = new OpenaireBrokerResult(); + final OaBrokerMainEntity hl = new OaBrokerMainEntity(); compileHighlight.accept(hl, getHighlightValue()); final String provId = getSource().getOriginalId(); @@ -111,18 +117,20 @@ public final class UpdateInfo { final String provUrl = getSource() .getInstances() .stream() - .map(Instance::getUrl) + .map(OaBrokerInstance::getUrl) .findFirst() .orElse(null); ; - final Provenance provenance = new Provenance().setId(provId).setRepositoryName(provRepo).setUrl(provUrl); + final OaBrokerProvenance provenance = new OaBrokerProvenance(provId, provRepo, provUrl); - return new OpenAireEventPayload() - .setPublication(target) - .setHighlight(hl) - .setTrust(trust) - .setProvenance(provenance); + final OaBrokerEventPayload res = new OaBrokerEventPayload(); + res.setResult(target); + res.setHighlight(hl); + res.setTrust(trust); + res.setProvenance(provenance); + + return res; } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultAggregator.java index dabe2bb4d6..ee1c8963e8 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultAggregator.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultAggregator.java @@ -5,11 +5,11 @@ import org.apache.spark.sql.Encoder; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.expressions.Aggregator; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.schema.oaf.Relation; import scala.Tuple2; -public class ResultAggregator extends Aggregator, ResultGroup, ResultGroup> { +public class ResultAggregator extends Aggregator, ResultGroup, ResultGroup> { /** * @@ -22,13 +22,15 @@ public class ResultAggregator extends Aggregator t) { - return group.addElement(t._1); + public ResultGroup reduce(final ResultGroup group, final Tuple2 t) { + group.getData().add(t._1); + return group; } @Override public ResultGroup merge(final ResultGroup g1, final ResultGroup g2) { - return g1.addGroup(g2); + g1.getData().addAll(g2.getData()); + return g1; } @Override @@ -38,13 +40,13 @@ public class ResultAggregator extends Aggregator bufferEncoder() { - return Encoders.kryo(ResultGroup.class); + return Encoders.bean(ResultGroup.class); } @Override public Encoder outputEncoder() { - return Encoders.kryo(ResultGroup.class); + return Encoders.bean(ResultGroup.class); } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultGroup.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultGroup.java index 4308224a58..e718e0f1c4 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultGroup.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultGroup.java @@ -5,7 +5,7 @@ import java.io.Serializable; import java.util.ArrayList; import java.util.List; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; public class ResultGroup implements Serializable { @@ -14,23 +14,14 @@ public class ResultGroup implements Serializable { */ private static final long serialVersionUID = -3360828477088669296L; - private final List data = new ArrayList<>(); + private List data = new ArrayList<>(); - public List getData() { + public List getData() { return data; } - public ResultGroup addElement(final OpenaireBrokerResult elem) { - data.add(elem); - return this; + public void setData(final List data) { + this.data = data; } - public ResultGroup addGroup(final ResultGroup group) { - data.addAll(group.getData()); - return this; - } - - public boolean isValid() { - return data.size() > 1; - } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/OpenaireBrokerResultAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/OaBrokerMainEntityAggregator.java similarity index 59% rename from dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/OpenaireBrokerResultAggregator.java rename to dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/OaBrokerMainEntityAggregator.java index b44fbe367c..6a2d9b06dc 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/OpenaireBrokerResultAggregator.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/OaBrokerMainEntityAggregator.java @@ -5,11 +5,11 @@ import org.apache.spark.sql.Encoder; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.expressions.Aggregator; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import scala.Tuple2; -public class OpenaireBrokerResultAggregator - extends Aggregator, OpenaireBrokerResult, OpenaireBrokerResult> { +public class OaBrokerMainEntityAggregator + extends Aggregator, OaBrokerMainEntity, OaBrokerMainEntity> { /** * @@ -17,17 +17,17 @@ public class OpenaireBrokerResultAggregator private static final long serialVersionUID = -3687878788861013488L; @Override - public OpenaireBrokerResult zero() { - return new OpenaireBrokerResult(); + public OaBrokerMainEntity zero() { + return new OaBrokerMainEntity(); } @Override - public OpenaireBrokerResult finish(final OpenaireBrokerResult g) { + public OaBrokerMainEntity finish(final OaBrokerMainEntity g) { return g; } @Override - public OpenaireBrokerResult reduce(final OpenaireBrokerResult g, final Tuple2 t) { + public OaBrokerMainEntity reduce(final OaBrokerMainEntity g, final Tuple2 t) { if (g.getOriginalId() == null) { return t._1; } else if (t._2 instanceof RelatedSoftware) { @@ -38,13 +38,15 @@ public class OpenaireBrokerResultAggregator g.getPublications().add(((RelatedPublication) t._2).getRelPublication()); } else if (t._2 instanceof RelatedProject) { g.getProjects().add(((RelatedProject) t._2).getRelProject()); + } else { + throw new RuntimeException("Invalid Object: " + t._2.getClass()); } return g; } @Override - public OpenaireBrokerResult merge(final OpenaireBrokerResult g1, final OpenaireBrokerResult g2) { + public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) { if (g1.getOriginalId() != null) { g1.getSoftwares().addAll(g2.getSoftwares()); g1.getDatasets().addAll(g2.getDatasets()); @@ -57,13 +59,13 @@ public class OpenaireBrokerResultAggregator } @Override - public Encoder bufferEncoder() { - return Encoders.kryo(OpenaireBrokerResult.class); + public Encoder bufferEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); } @Override - public Encoder outputEncoder() { - return Encoders.kryo(OpenaireBrokerResult.class); + public Encoder outputEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDataset.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDataset.java index fcf1b89b1c..daf75ea2ef 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDataset.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDataset.java @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; import java.io.Serializable; -import eu.dnetlib.broker.objects.Dataset; +import eu.dnetlib.broker.objects.OaBrokerRelatedDataset; public class RelatedDataset implements Serializable { @@ -11,11 +11,14 @@ public class RelatedDataset implements Serializable { * */ private static final long serialVersionUID = 774487705184038324L; - private final String source; - private final String relType; - private final Dataset relDataset; + private String source; + private String relType; + private OaBrokerRelatedDataset relDataset; - public RelatedDataset(final String source, final String relType, final Dataset relDataset) { + public RelatedDataset() { + } + + public RelatedDataset(final String source, final String relType, final OaBrokerRelatedDataset relDataset) { this.source = source; this.relType = relType; this.relDataset = relDataset; @@ -25,12 +28,24 @@ public class RelatedDataset implements Serializable { return source; } + public void setSource(final String source) { + this.source = source; + } + public String getRelType() { return relType; } - public Dataset getRelDataset() { + public void setRelType(final String relType) { + this.relType = relType; + } + + public OaBrokerRelatedDataset getRelDataset() { return relDataset; } + public void setRelDataset(final OaBrokerRelatedDataset relDataset) { + this.relDataset = relDataset; + } + } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedEntityFactory.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedEntityFactory.java deleted file mode 100644 index c60d4f1417..0000000000 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedEntityFactory.java +++ /dev/null @@ -1,34 +0,0 @@ - -package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; - -import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; -import eu.dnetlib.dhp.schema.oaf.Dataset; -import eu.dnetlib.dhp.schema.oaf.Project; -import eu.dnetlib.dhp.schema.oaf.Publication; -import eu.dnetlib.dhp.schema.oaf.Software; - -public class RelatedEntityFactory { - - @SuppressWarnings("unchecked") - public static RT newRelatedEntity(final String sourceId, - final String relType, - final T target, - final Class clazz) { - - if (clazz == RelatedProject.class) { - return (RT) new RelatedProject(sourceId, relType, - ConversionUtils.oafProjectToBrokerProject((Project) target)); - } else if (clazz == RelatedSoftware.class) { - return (RT) new RelatedSoftware(sourceId, relType, - ConversionUtils.oafSoftwareToBrokerSoftware((Software) target)); - } else if (clazz == RelatedDataset.class) { - return (RT) new RelatedDataset(sourceId, relType, - ConversionUtils.oafDatasetToBrokerDataset((Dataset) target)); - } else if (clazz == RelatedPublication.class) { - return (RT) new RelatedPublication(sourceId, relType, - ConversionUtils.oafPublicationToBrokerPublication((Publication) target)); - } else { - return null; - } - } -} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProject.java index 233041c096..4116c8c77b 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProject.java @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; import java.io.Serializable; -import eu.dnetlib.broker.objects.Project; +import eu.dnetlib.broker.objects.OaBrokerProject; public class RelatedProject implements Serializable { @@ -12,11 +12,14 @@ public class RelatedProject implements Serializable { */ private static final long serialVersionUID = 4941437626549329870L; - private final String source; - private final String relType; - private final Project relProject; + private String source; + private String relType; + private OaBrokerProject relProject; - public RelatedProject(final String source, final String relType, final Project relProject) { + public RelatedProject() { + } + + public RelatedProject(final String source, final String relType, final OaBrokerProject relProject) { this.source = source; this.relType = relType; this.relProject = relProject; @@ -26,12 +29,24 @@ public class RelatedProject implements Serializable { return source; } + public void setSource(final String source) { + this.source = source; + } + public String getRelType() { return relType; } - public Project getRelProject() { + public void setRelType(final String relType) { + this.relType = relType; + } + + public OaBrokerProject getRelProject() { return relProject; } + public void setRelProject(final OaBrokerProject relProject) { + this.relProject = relProject; + } + } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublication.java index 80b92462d1..9e222a9523 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublication.java @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; import java.io.Serializable; -import eu.dnetlib.broker.objects.Publication; +import eu.dnetlib.broker.objects.OaBrokerRelatedPublication; public class RelatedPublication implements Serializable { @@ -12,11 +12,15 @@ public class RelatedPublication implements Serializable { */ private static final long serialVersionUID = 9021609640411395128L; - private final String source; - private final String relType; - private final Publication relPublication; + private String source; + private String relType; + private OaBrokerRelatedPublication relPublication; - public RelatedPublication(final String source, final String relType, final Publication relPublication) { + public RelatedPublication() { + } + + public RelatedPublication(final String source, final String relType, + final OaBrokerRelatedPublication relPublication) { this.source = source; this.relType = relType; this.relPublication = relPublication; @@ -26,12 +30,24 @@ public class RelatedPublication implements Serializable { return source; } + public void setSource(final String source) { + this.source = source; + } + public String getRelType() { return relType; } - public Publication getRelPublication() { + public void setRelType(final String relType) { + this.relType = relType; + } + + public OaBrokerRelatedPublication getRelPublication() { return relPublication; } + public void setRelPublication(final OaBrokerRelatedPublication relPublication) { + this.relPublication = relPublication; + } + } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftware.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftware.java index 13f1f42909..2f3b8668c2 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftware.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftware.java @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; import java.io.Serializable; -import eu.dnetlib.broker.objects.Software; +import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware; public class RelatedSoftware implements Serializable { @@ -11,11 +11,14 @@ public class RelatedSoftware implements Serializable { * */ private static final long serialVersionUID = 7573383356943300157L; - private final String source; - private final String relType; - private final Software relSoftware; + private String source; + private String relType; + private OaBrokerRelatedSoftware relSoftware; - public RelatedSoftware(final String source, final String relType, final Software relSoftware) { + public RelatedSoftware() { + } + + public RelatedSoftware(final String source, final String relType, final OaBrokerRelatedSoftware relSoftware) { this.source = source; this.relType = relType; this.relSoftware = relSoftware; @@ -25,12 +28,24 @@ public class RelatedSoftware implements Serializable { return source; } + public void setSource(final String source) { + this.source = source; + } + public String getRelType() { return relType; } - public Software getRelSoftware() { + public void setRelType(final String relType) { + this.relType = relType; + } + + public OaBrokerRelatedSoftware getRelSoftware() { return relSoftware; } + public void setRelSoftware(final OaBrokerRelatedSoftware relSoftware) { + this.relSoftware = relSoftware; + } + } diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml index da573ae9cd..ea9aabcfce 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml @@ -78,21 +78,33 @@ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + - - - - - eu.dnetlib.dhp.broker.oa.GenerateEventsApplication + + yarn + cluster + GenerateEvents + eu.dnetlib.dhp.broker.oa.GenerateEventsApplication + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + --graphPath${graphInputPath} --eventsPath${eventsOutputPath} --isLookupUrl${isLookupUrl} --dedupConfProfile${dedupConfProfId} - + + diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_relations.json b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_relations.json new file mode 100644 index 0000000000..32fd1d8f32 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_relations.json @@ -0,0 +1,14 @@ +[ + { + "paramName": "g", + "paramLongName": "graphPath", + "paramDescription": "the path where there the graph is stored", + "paramRequired": true + }, + { + "paramName": "o", + "paramLongName": "relsPath", + "paramDescription": "the path where the generated relations will be stored", + "paramRequired": true + } +] diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_simple_entities.json b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_simple_entities.json new file mode 100644 index 0000000000..6f5e330f60 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_simple_entities.json @@ -0,0 +1,14 @@ +[ + { + "paramName": "g", + "paramLongName": "graphPath", + "paramDescription": "the path where there the graph is stored", + "paramRequired": true + }, + { + "paramName": "o", + "paramLongName": "simpleEntitiesPath", + "paramDescription": "the path where the generated simple entities (without relations) will be stored", + "paramRequired": true + } +] diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java index b2c7152d5f..8f43ab1cf3 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java @@ -3,9 +3,13 @@ package eu.dnetlib.dhp.oa.graph.clean; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import java.io.BufferedInputStream; +import java.util.Objects; import java.util.Optional; +import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; @@ -19,7 +23,9 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; @@ -84,12 +90,86 @@ public class CleanGraphSparkJob { readTableFromPath(spark, inputPath, clazz) .map((MapFunction) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz)) + .map((MapFunction) value -> fixDefaults(value), Encoders.bean(clazz)) .write() .mode(SaveMode.Overwrite) .option("compression", "gzip") .json(outputPath); } + private static T fixDefaults(T value) { + if (value instanceof Datasource) { + // nothing to clean here + } else if (value instanceof Project) { + // nothing to clean here + } else if (value instanceof Organization) { + Organization o = (Organization) value; + if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) { + o.setCountry(qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_COUNTRY_TYPE)); + } + } else if (value instanceof Relation) { + // nothing to clean here + } else if (value instanceof Result) { + + Result r = (Result) value; + if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) { + r + .setLanguage( + qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES)); + } + if (Objects.nonNull(r.getSubject())) { + r + .setSubject( + r + .getSubject() + .stream() + .filter(Objects::nonNull) + .filter(sp -> StringUtils.isNotBlank(sp.getValue())) + .filter(sp -> Objects.nonNull(sp.getQualifier())) + .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) + .collect(Collectors.toList())); + } + if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) { + r + .setResourcetype( + qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE)); + } + if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) { + r + .setBestaccessright( + qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES)); + } + if (Objects.nonNull(r.getInstance())) { + for (Instance i : r.getInstance()) { + if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) { + i.setAccessright(qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES)); + } + if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) { + i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY); + } + } + } + + if (value instanceof Publication) { + + } else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) { + + } else if (value instanceof OtherResearchProduct) { + + } else if (value instanceof Software) { + + } + } + + return value; + } + + private static Qualifier qualifier(String classid, String classname, String scheme) { + return OafMapperUtils + .qualifier( + classid, classname, scheme, scheme); + } + private static Dataset readTableFromPath( SparkSession spark, String inputEntityPath, Class clazz) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java index 8006f73009..d2d4e118fd 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java @@ -4,10 +4,13 @@ package eu.dnetlib.dhp.oa.graph.clean; import java.io.Serializable; import java.util.HashMap; +import org.apache.commons.lang3.StringUtils; + import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.Country; import eu.dnetlib.dhp.schema.oaf.Qualifier; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; public class CleaningRuleMap extends HashMap> implements Serializable { @@ -18,23 +21,24 @@ public class CleaningRuleMap extends HashMap */ public static CleaningRuleMap create(VocabularyGroup vocabularies) { CleaningRuleMap mapping = new CleaningRuleMap(); - mapping.put(Qualifier.class, o -> { - Qualifier q = (Qualifier) o; - if (vocabularies.vocabularyExists(q.getSchemeid())) { - Qualifier newValue = vocabularies.lookup(q.getSchemeid(), q.getClassid()); - q.setClassid(newValue.getClassid()); - q.setClassname(newValue.getClassname()); + mapping.put(Qualifier.class, o -> cleanQualifier(vocabularies, (Qualifier) o)); + mapping.put(Country.class, o -> { + final Country c = (Country) o; + if (StringUtils.isBlank(c.getSchemeid())) { + c.setSchemeid(ModelConstants.DNET_COUNTRY_TYPE); + c.setSchemename(ModelConstants.DNET_COUNTRY_TYPE); } - }); - mapping.put(StructuredProperty.class, o -> { - StructuredProperty sp = (StructuredProperty) o; - // TODO implement a policy - /* - * if (StringUtils.isBlank(sp.getValue())) { sp.setValue(null); sp.setQualifier(null); sp.setDataInfo(null); - * } - */ + cleanQualifier(vocabularies, c); }); return mapping; } + private static void cleanQualifier(VocabularyGroup vocabularies, Q q) { + if (vocabularies.vocabularyExists(q.getSchemeid())) { + Qualifier newValue = vocabularies.lookup(q.getSchemeid(), q.getClassid()); + q.setClassid(newValue.getClassid()); + q.setClassname(newValue.getClassname()); + } + } + } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java index d9ff62596a..334339d3be 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java @@ -122,7 +122,11 @@ public class VocabularyGroup implements Serializable { } public boolean vocabularyExists(final String vocId) { - return vocs.containsKey(vocId.toLowerCase()); + return Optional + .ofNullable(vocId) + .map(String::toLowerCase) + .map(id -> vocs.containsKey(id)) + .orElse(false); } private void addSynonyms(final String vocId, final String termId, final String syn) { diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java index 1b21ce2d37..4783aa81f6 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java @@ -21,6 +21,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; @@ -56,6 +57,9 @@ public class CleaningFunctionTest { String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result.json")); Publication p_in = MAPPER.readValue(json, Publication.class); + assertTrue(p_in instanceof Result); + assertTrue(p_in instanceof Publication); + Publication p_out = OafCleaner.apply(p_in, mapping); assertNotNull(p_out); @@ -63,6 +67,9 @@ public class CleaningFunctionTest { assertEquals("und", p_out.getLanguage().getClassid()); assertEquals("Undetermined", p_out.getLanguage().getClassname()); + assertEquals("DE", p_out.getCountry().get(0).getClassid()); + assertEquals("Germany", p_out.getCountry().get(0).getClassname()); + assertEquals("0018", p_out.getInstance().get(0).getInstancetype().getClassid()); assertEquals("Annotation", p_out.getInstance().get(0).getInstancetype().getClassname()); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json index b63a12f619..2c1d5017d6 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json @@ -202,6 +202,12 @@ "contributor": [ ], "country": [ + { + "classid": "DE", + "classname": "DE", + "schemeid": "dnet:countries", + "schemename": "dnet:countries" + } ], "coverage": [ ], diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala index 5d7c444b2c..637362acf0 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala @@ -4,9 +4,12 @@ import java.time.LocalDateTime import java.time.format.DateTimeFormatter import eu.dnetlib.dhp.common.PacePerson -import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Dataset, Field, Instance, KeyValue, Publication, Qualifier, Relation, StructuredProperty} +import eu.dnetlib.dhp.schema.action.AtomicAction +import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Dataset, ExternalReference, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, StructuredProperty} import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIRelation} +import eu.dnetlib.dhp.utils.DHPUtils import org.apache.commons.lang3.StringUtils +import org.codehaus.jackson.map.ObjectMapper import scala.collection.JavaConverters._ @@ -77,6 +80,76 @@ object DLIToOAF { ) + val rel_inverse: Map[String, String] = Map( + "isRelatedTo" -> "isRelatedTo", + "IsSupplementedBy" -> "isSupplementTo", + "cites" -> "IsCitedBy", + "IsCitedBy" -> "cites", + "reviews" -> "IsReviewedBy" + ) + + + val PidTypeMap: Map[String, String] = Map( + "pbmid" -> "pmid", + "pmcid" -> "pmc", + "pmid" -> "pmid", + "pubmedid" -> "pmid", + "DOI" -> "doi", + "doi" -> "doi" + ) + + + def toActionSet(item: Oaf): (String, String) = { + val mapper = new ObjectMapper() + + item match { + case dataset: Dataset => + val a: AtomicAction[Dataset] = new AtomicAction[Dataset] + a.setClazz(classOf[Dataset]) + a.setPayload(dataset) + (dataset.getClass.getCanonicalName, mapper.writeValueAsString(a)) + case publication: Publication => + val a: AtomicAction[Publication] = new AtomicAction[Publication] + a.setClazz(classOf[Publication]) + a.setPayload(publication) + (publication.getClass.getCanonicalName, mapper.writeValueAsString(a)) + case relation: Relation => + val a: AtomicAction[Relation] = new AtomicAction[Relation] + a.setClazz(classOf[Relation]) + a.setPayload(relation) + (relation.getClass.getCanonicalName, mapper.writeValueAsString(a)) + case _ => + null + } + } + + def convertClinicalTrial(dataset: DLIDataset): (String, String) = { + val currentId = generateId(dataset.getId) + val pids = dataset.getPid.asScala.filter(p => "clinicaltrials.gov".equalsIgnoreCase(p.getQualifier.getClassname)).map(p => s"50|r3111dacbab5::${DHPUtils.md5(p.getValue.toLowerCase())}") + if (pids.isEmpty) + null + else + (currentId, pids.head) + } + + + def insertExternalRefs(publication: Publication, externalReferences: List[DLIExternalReference]): Publication = { + + val eRefs = externalReferences.map(e => { + val result = new ExternalReference() + result.setSitename(e.sitename) + result.setLabel(e.label) + result.setUrl(e.url) + result.setRefidentifier(e.pid) + result.setDataInfo(generateDataInfo()) + result.setQualifier(createQualifier(e.classId, "dnet:externalReference_typologies")) + result + }) + publication.setExternalReference(eRefs.asJava) + publication + + } + def filterPid(p: StructuredProperty): Boolean = { if (expectecdPidType.contains(p.getQualifier.getClassname) && p.getQualifier.getClassname.equalsIgnoreCase("url")) if (filteredURL.exists(u => p.getValue.contains(u))) @@ -97,7 +170,6 @@ object DLIToOAF { } def convertDLIDatasetToExternalReference(dataset: DLIDataset): DLIExternalReference = { - val currentId = generateId(dataset.getId) val pids = dataset.getPid.asScala.filter(filterPid) if (pids == null || pids.isEmpty) @@ -109,7 +181,7 @@ object DLIToOAF { pid.getQualifier.getClassname match { case "uniprot" => DLIExternalReference(generateId(dataset.getId), s"https://www.uniprot.org/uniprot/${pid.getValue}", "UniProt", extractTitle(dataset.getTitle), pid.getValue, "accessionNumber") case "ena" => - if(pid.getValue!= null && pid.getValue.nonEmpty && pid.getValue.length>7) + if (pid.getValue != null && pid.getValue.nonEmpty && pid.getValue.length > 7) DLIExternalReference(generateId(dataset.getId), s"https://www.ebi.ac.uk/ena/data/view/${pid.getValue.substring(0, 8)}", "European Nucleotide Archive", extractTitle(dataset.getTitle), pid.getValue, "accessionNumber") else null @@ -126,43 +198,50 @@ object DLIToOAF { } - def convertDLIPublicationToOAF(p: DLIPublication): Publication = { - + def convertDLIPublicationToOAF(inputPublication: DLIPublication): Publication = { val result = new Publication - result.setId(generateId(p.getId)) + val cleanedPids = inputPublication.getPid.asScala.filter(p => PidTypeMap.contains(p.getQualifier.getClassid)) + .map(p => { + p.setQualifier(createQualifier(PidTypeMap(p.getQualifier.getClassid), p.getQualifier.getSchemeid)) + p + }) + if (cleanedPids.isEmpty) + return null + result.setId(generateId(inputPublication.getId)) result.setDataInfo(generateDataInfo(invisibile = true)) - if (p.getCollectedfrom == null || p.getCollectedfrom.size() == 0 || (p.getCollectedfrom.size() == 1 && p.getCollectedfrom.get(0) == null)) + if (inputPublication.getCollectedfrom == null || inputPublication.getCollectedfrom.size() == 0 || (inputPublication.getCollectedfrom.size() == 1 && inputPublication.getCollectedfrom.get(0) == null)) return null - - result.setCollectedfrom(p.getCollectedfrom.asScala.map(c => collectedFromMap.getOrElse(c.getKey, null)).asJava) - result.setPid(p.getPid) - result.setDateofcollection(p.getDateofcollection) - result.setOriginalId(p.getPid.asScala.map(p => p.getValue).asJava) + result.setCollectedfrom(inputPublication.getCollectedfrom.asScala.map(c => collectedFromMap.getOrElse(c.getKey, null)).filter(p => p != null).asJava) + if(result.getCollectedfrom.isEmpty) + return null + result.setPid(cleanedPids.asJava) + result.setDateofcollection(inputPublication.getDateofcollection) + result.setOriginalId(inputPublication.getPid.asScala.map(p => p.getValue).asJava) result.setDateoftransformation(LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'"))) - if (p.getAuthor == null || p.getAuthor.isEmpty) + if (inputPublication.getAuthor == null || inputPublication.getAuthor.isEmpty) return null - result.setAuthor(p.getAuthor.asScala.map(convertAuthor).asJava) - result.setResulttype(createQualifier(p.getResulttype.getClassid, p.getResulttype.getClassname, "dnet:result_typologies", "dnet:result_typologies")) + result.setAuthor(inputPublication.getAuthor.asScala.map(convertAuthor).asJava) + result.setResulttype(createQualifier(inputPublication.getResulttype.getClassid, inputPublication.getResulttype.getClassname, "dnet:result_typologies", "dnet:result_typologies")) - if (p.getSubject != null) - result.setSubject(p.getSubject.asScala.map(convertSubject).asJava) + if (inputPublication.getSubject != null) + result.setSubject(inputPublication.getSubject.asScala.map(convertSubject).asJava) - if (p.getTitle == null || p.getTitle.isEmpty) + if (inputPublication.getTitle == null || inputPublication.getTitle.isEmpty) return null - result.setTitle(List(patchTitle(p.getTitle.get(0))).asJava) + result.setTitle(List(patchTitle(inputPublication.getTitle.get(0))).asJava) - if (p.getRelevantdate == null || p.getRelevantdate.size() == 0) + if (inputPublication.getRelevantdate == null || inputPublication.getRelevantdate.size() == 0) return null - result.setRelevantdate(p.getRelevantdate.asScala.map(patchRelevantDate).asJava) + result.setRelevantdate(inputPublication.getRelevantdate.asScala.map(patchRelevantDate).asJava) - result.setDescription(p.getDescription) + result.setDescription(inputPublication.getDescription) - result.setDateofacceptance(asField(p.getRelevantdate.get(0).getValue)) - result.setPublisher(p.getPublisher) - result.setSource(p.getSource) + result.setDateofacceptance(asField(inputPublication.getRelevantdate.get(0).getValue)) + result.setPublisher(inputPublication.getPublisher) + result.setSource(inputPublication.getSource) result.setBestaccessright(createQualifier("UNKNOWN", "not available", "dnet:access_modes", "dnet:access_modes")) val dois = result.getPid.asScala.filter(p => "doi".equalsIgnoreCase(p.getQualifier.getClassname)).map(p => p.getValue) @@ -170,7 +249,7 @@ object DLIToOAF { return null - val i: Instance = createInstance(s"https://dx.doi.org/${dois.head}", firstInstanceOrNull(p.getInstance()), result.getDateofacceptance) + val i: Instance = createInstance(s"https://dx.doi.org/${dois.head}", firstInstanceOrNull(inputPublication.getInstance()), result.getDateofacceptance) if (i != null) result.setInstance(List(i).asJava) @@ -211,7 +290,9 @@ object DLIToOAF { val result: Dataset = new Dataset result.setId(generateId(d.getId)) result.setDataInfo(generateDataInfo()) - result.setCollectedfrom(d.getCollectedfrom.asScala.map(c => collectedFromMap.getOrElse(c.getKey, null)).asJava) + result.setCollectedfrom(d.getCollectedfrom.asScala.map(c => collectedFromMap.getOrElse(c.getKey, null)).filter(p => p != null).asJava) + if(result.getCollectedfrom.isEmpty) + return null result.setPid(d.getPid) @@ -280,7 +361,7 @@ object DLIToOAF { if (dataset) i.setInstancetype(createQualifier("0021", "Dataset", "dnet:publication_resource", "dnet:publication_resource")) else - i.setInstancetype(createQualifier("0000", "UNKNOWN", "dnet:publication_resource", "dnet:publication_resource")) + i.setInstancetype(createQualifier("0000", "Unknown", "dnet:publication_resource", "dnet:publication_resource")) if (originalInstance != null && originalInstance.getHostedby != null) i.setHostedby(originalInstance.getHostedby) diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/SparkExportContentForOpenAire.scala b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/SparkExportContentForOpenAire.scala index f3aa35549c..edf951df4d 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/SparkExportContentForOpenAire.scala +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/SparkExportContentForOpenAire.scala @@ -4,10 +4,16 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset} import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIRelation} import org.apache.commons.io.IOUtils +import org.apache.hadoop.io.Text +import org.apache.hadoop.io.compress.GzipCodec +import org.apache.hadoop.mapred.SequenceFileOutputFormat import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.expressions.Window import org.apache.spark.{SparkConf, SparkContext} import org.codehaus.jackson.map.ObjectMapper + import scala.collection.mutable.ArrayBuffer @@ -36,57 +42,66 @@ object SparkExportContentForOpenAire { implicit val dliRelEncoder: Encoder[DLIRelation] = Encoders.bean(classOf[DLIRelation]) import spark.implicits._ -// -// val relRDD:RDD[Relation] = sc.textFile(s"$workingPath/relation_j") -// .map(s => new ObjectMapper().readValue(s, classOf[DLIRelation])) -// .filter(p => p.getDataInfo.getDeletedbyinference == false) -// .map(DLIToOAF.convertDLIRelation).filter(p=>p!= null) -// spark.createDataset(relRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationDS") -// -// val datRDD:RDD[OafDataset] = sc.textFile(s"$workingPath/dataset") -// .map(s => new ObjectMapper().readValue(s, classOf[DLIDataset])) -// .filter(p => p.getDataInfo.getDeletedbyinference == false) -// .map(DLIToOAF.convertDLIDatasetTOOAF).filter(p=>p!= null) -// spark.createDataset(datRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/datasetDS") -// -// -// val pubRDD:RDD[Publication] = sc.textFile(s"$workingPath/publication") -// .map(s => new ObjectMapper().readValue(s, classOf[DLIPublication])) -// .filter(p => p.getDataInfo.getDeletedbyinference == false) -// .map(DLIToOAF.convertDLIPublicationToOAF).filter(p=>p!= null) -// spark.createDataset(pubRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/publicationDS") -// -// -// -// val pubs:Dataset[Publication] = spark.read.load(s"$workingPath/publicationDS").as[Publication] -// val dats :Dataset[OafDataset] = spark.read.load(s"$workingPath/datasetDS").as[OafDataset] - var relDS :Dataset[Relation] = spark.read.load(s"$workingPath/relationDS").as[Relation] -// -// -// pubs.joinWith(relDS, pubs("id").equalTo(relDS("source"))).map(k => k._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationDS_f1") -// -// relDS= spark.read.load(s"$workingPath/relationDS_f1").as[Relation] -// -// relDS.joinWith(dats, relDS("target").equalTo(dats("id"))).map(k => k._1).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationDS_filtered") -// -// -// val r_source = relDS.select(relDS("source")).distinct() -// val r_target = relDS.select(relDS("source")).distinct() -// -// -// pubs.joinWith(r_source, pubs("id").equalTo(r_source("source")), "inner").map(k => k._1).write.mode(SaveMode.Overwrite).save(s"$workingPath/publicationDS_filtered") -// -// dats.joinWith(r_target, dats("id").equalTo(r_target("target")), "inner").map(k => k._1).write.mode(SaveMode.Overwrite).save(s"$workingPath/datasetDS_filtered") -// -// spark.createDataset(sc.textFile(s"$workingPath/dataset") -// .map(s => new ObjectMapper().readValue(s, classOf[DLIDataset])) -// .map(DLIToOAF.convertDLIDatasetToExternalReference) -// .filter(p => p != null)).as[DLIExternalReference].write.mode(SaveMode.Overwrite).save(s"$workingPath/externalReference") -// + + val relRDD:RDD[Relation] = sc.textFile(s"$workingPath/relation_j") + .map(s => new ObjectMapper().readValue(s, classOf[DLIRelation])) + .filter(p => p.getDataInfo.getDeletedbyinference == false) + .map(DLIToOAF.convertDLIRelation).filter(p=>p!= null) + spark.createDataset(relRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationDS") + + val datRDD:RDD[OafDataset] = sc.textFile(s"$workingPath/dataset") + .map(s => new ObjectMapper().readValue(s, classOf[DLIDataset])) + .filter(p => p.getDataInfo.getDeletedbyinference == false) + .map(DLIToOAF.convertDLIDatasetTOOAF).filter(p=>p!= null) + spark.createDataset(datRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/datasetDS") + + + val pubRDD:RDD[Publication] = sc.textFile(s"$workingPath/publication") + .map(s => new ObjectMapper().readValue(s, classOf[DLIPublication])) + .filter(p => p.getDataInfo.getDeletedbyinference == false) + .map(DLIToOAF.convertDLIPublicationToOAF).filter(p=>p!= null) + spark.createDataset(pubRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/publicationDS") + + + + val pubs:Dataset[Publication] = spark.read.load(s"$workingPath/publicationDS").as[Publication] + val dats :Dataset[OafDataset] = spark.read.load(s"$workingPath/datasetDS").as[OafDataset] + val relDS1 :Dataset[Relation] = spark.read.load(s"$workingPath/relationDS").as[Relation] + + + val pub_id = pubs.select("id").distinct() + val dat_id = dats.select("id").distinct() + + + pub_id.joinWith(relDS1, pub_id("id").equalTo(relDS1("source"))).map(k => k._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationDS_f1") + + val relDS2= spark.read.load(s"$workingPath/relationDS_f1").as[Relation] + + relDS2.joinWith(dat_id, relDS2("target").equalTo(dats("id"))).map(k => k._1).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationDS_filtered") + + + val r_source = relDS2.select(relDS2("source")).distinct() + val r_target = relDS2.select(relDS2("target")).distinct() + + + val w2 = Window.partitionBy("id").orderBy("lastupdatetimestamp") + + pubs.joinWith(r_source, pubs("id").equalTo(r_source("source")), "inner").map(k => k._1) + .withColumn("row",row_number.over(w2)).where($"row" === 1).drop("row") + .write.mode(SaveMode.Overwrite).save(s"$workingPath/publicationDS_filtered") + + dats.joinWith(r_target, dats("id").equalTo(r_target("target")), "inner").map(k => k._1) + .withColumn("row",row_number.over(w2)).where($"row" === 1).drop("row") + .write.mode(SaveMode.Overwrite).save(s"$workingPath/datasetAS") + + spark.createDataset(sc.textFile(s"$workingPath/dataset") + .map(s => new ObjectMapper().readValue(s, classOf[DLIDataset])) + .map(DLIToOAF.convertDLIDatasetToExternalReference) + .filter(p => p != null)).as[DLIExternalReference].write.mode(SaveMode.Overwrite).save(s"$workingPath/externalReference") val pf = spark.read.load(s"$workingPath/publicationDS_filtered").select("id") - relDS = spark.read.load(s"$workingPath/relationDS").as[Relation] - val relationTo = pf.joinWith(relDS, pf("id").equalTo(relDS("source")),"inner").map(t =>t._2) + val relDS3 = spark.read.load(s"$workingPath/relationDS").as[Relation] + val relationTo = pf.joinWith(relDS3, pf("id").equalTo(relDS3("source")),"inner").map(t =>t._2) val extRef = spark.read.load(s"$workingPath/externalReference").as[DLIExternalReference] @@ -100,19 +115,70 @@ object SparkExportContentForOpenAire { (f._1, dli_ext) })).write.mode(SaveMode.Overwrite).save(s"$workingPath/externalReference_grouped") + val pubf :Dataset[Publication] = spark.read.load(s"$workingPath/publicationDS_filtered").as[Publication] + + val groupedERf:Dataset[(String, List[DLIExternalReference])]= spark.read.load(s"$workingPath/externalReference_grouped").as[(String, List[DLIExternalReference])] + + groupedERf.joinWith(pubf,pubf("id").equalTo(groupedERf("_1"))).map(t => + { + val publication = t._2 + if (t._1 != null) { + val eRefs = t._1._2 + DLIToOAF.insertExternalRefs(publication, eRefs) + + } else + publication + } + ).write.mode(SaveMode.Overwrite).save(s"$workingPath/publicationAS") + spark.createDataset(sc.textFile(s"$workingPath/dataset") + .map(s => new ObjectMapper().readValue(s, classOf[DLIDataset])) + .map(DLIToOAF.convertClinicalTrial) + .filter(p => p != null)) + .write.mode(SaveMode.Overwrite).save(s"$workingPath/clinicalTrials") + + val ct:Dataset[(String,String)] = spark.read.load(s"$workingPath/clinicalTrials").as[(String,String)] + + val relDS= spark.read.load(s"$workingPath/relationDS_f1").as[Relation] + + relDS.joinWith(ct, relDS("target").equalTo(ct("_1")), "inner") + .map(k =>{ + val currentRel = k._1 + currentRel.setTarget(k._2._2) + currentRel + }).write.mode(SaveMode.Overwrite).save(s"$workingPath/clinicalTrialsRels") + val clRels:Dataset[Relation] = spark.read.load(s"$workingPath/clinicalTrialsRels").as[Relation] + val rels:Dataset[Relation] = spark.read.load(s"$workingPath/relationDS_filtered").as[Relation] + + rels.union(clRels).flatMap(r => { + val inverseRel = new Relation + inverseRel.setSource(r.getTarget) + inverseRel.setTarget(r.getSource) + inverseRel.setDataInfo(r.getDataInfo) + inverseRel.setCollectedfrom(r.getCollectedfrom) + inverseRel.setRelType(r.getRelType) + inverseRel.setSubRelType(r.getSubRelType) + inverseRel.setRelClass(DLIToOAF.rel_inverse(r.getRelClass)) + List(r, inverseRel) + }).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationAS") + val fRels:Dataset[(String,String)] = spark.read.load(s"$workingPath/relationAS").as[Relation].map(DLIToOAF.toActionSet) + val fpubs:Dataset[(String,String)] = spark.read.load(s"$workingPath/publicationAS").as[Publication].map(DLIToOAF.toActionSet) + val fdats:Dataset[(String,String)] = spark.read.load(s"$workingPath/datasetAS").as[OafDataset].map(DLIToOAF.toActionSet) - - - - - + fRels.union(fpubs).union(fdats).rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingPath/rawset", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec]) } + + + + + + + }