diff --git a/dhp-build/dhp-build-assembly-resources/pom.xml b/dhp-build/dhp-build-assembly-resources/pom.xml index 2a89a26fd..012ff89a3 100644 --- a/dhp-build/dhp-build-assembly-resources/pom.xml +++ b/dhp-build/dhp-build-assembly-resources/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib.dhp dhp-build - 1.2.3-SNAPSHOT + 1.2.4-SNAPSHOT dhp-build-assembly-resources diff --git a/dhp-build/dhp-build-properties-maven-plugin/pom.xml b/dhp-build/dhp-build-properties-maven-plugin/pom.xml index 5be114e3c..256017e2c 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/pom.xml +++ b/dhp-build/dhp-build-properties-maven-plugin/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib.dhp dhp-build - 1.2.3-SNAPSHOT + 1.2.4-SNAPSHOT dhp-build-properties-maven-plugin diff --git a/dhp-build/dhp-code-style/pom.xml b/dhp-build/dhp-code-style/pom.xml index 515ed35ce..e60e8076e 100644 --- a/dhp-build/dhp-code-style/pom.xml +++ b/dhp-build/dhp-code-style/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib.dhp dhp-code-style - 1.2.3-SNAPSHOT + 1.2.4-SNAPSHOT jar diff --git a/dhp-build/pom.xml b/dhp-build/pom.xml index d2dcbc36e..12b999b9c 100644 --- a/dhp-build/pom.xml +++ b/dhp-build/pom.xml @@ -4,7 +4,7 @@ eu.dnetlib.dhp dhp - 1.2.3-SNAPSHOT + 1.2.4-SNAPSHOT dhp-build pom diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index 0e7652dd3..0819a8bd2 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib.dhp dhp - 1.2.3-SNAPSHOT + 1.2.4-SNAPSHOT ../ diff --git a/dhp-schemas/pom.xml b/dhp-schemas/pom.xml index 56fb8ead2..2e5652b43 100644 --- a/dhp-schemas/pom.xml +++ b/dhp-schemas/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib.dhp dhp - 1.2.3-SNAPSHOT + 1.2.4-SNAPSHOT ../ diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java index a3c1610db..c5905e45b 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java @@ -1,6 +1,10 @@ package eu.dnetlib.dhp.schema.common; +import java.security.Key; + +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.Qualifier; public class ModelConstants { @@ -95,6 +99,9 @@ public class ModelConstants { SYSIMPORT_CROSSWALK_ENTITYREGISTRY, SYSIMPORT_CROSSWALK_ENTITYREGISTRY, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS); + public static final KeyValue UNKNOWN_REPOSITORY = keyValue( + "10|openaire____::55045bd2a65019fd8e6741a755395c8c", "Unknown Repository"); + private static Qualifier qualifier( final String classid, final String classname, @@ -107,4 +114,12 @@ public class ModelConstants { q.setSchemename(schemename); return q; } + + private static KeyValue keyValue(String key, String value) { + KeyValue kv = new KeyValue(); + kv.setKey(key); + kv.setValue(value); + kv.setDataInfo(new DataInfo()); + return kv; + } } diff --git a/dhp-workflows/dhp-actionmanager/pom.xml b/dhp-workflows/dhp-actionmanager/pom.xml index b50c6705b..0b4d25700 100644 --- a/dhp-workflows/dhp-actionmanager/pom.xml +++ b/dhp-workflows/dhp-actionmanager/pom.xml @@ -4,7 +4,7 @@ eu.dnetlib.dhp dhp-workflows - 1.2.3-SNAPSHOT + 1.2.4-SNAPSHOT dhp-actionmanager diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java index 17bfc4af3..5fa9e6723 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java @@ -4,6 +4,7 @@ package eu.dnetlib.dhp.actionmanager.promote; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass; +import java.io.IOException; import java.util.Objects; import java.util.Optional; import java.util.function.BiFunction; @@ -20,6 +21,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.exc.UnrecognizedPropertyException; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier; @@ -134,24 +136,39 @@ public class PromoteActionPayloadForGraphTableJob { .map( (MapFunction) value -> OBJECT_MAPPER.readValue(value, rowClazz), Encoders.bean(rowClazz)); - - /* - * return spark .read() .parquet(path) .as(Encoders.bean(rowClazz)); - */ } private static Dataset readActionPayload( SparkSession spark, String path, Class actionPayloadClazz) { logger.info("Reading action payload from path: {}", path); + return spark .read() .parquet(path) + .map((MapFunction) value -> extractPayload(value), Encoders.STRING()) .map( - (MapFunction) value -> OBJECT_MAPPER - .readValue(value. getAs("payload"), actionPayloadClazz), + (MapFunction) value -> decodePayload(actionPayloadClazz, value), Encoders.bean(actionPayloadClazz)); } + private static String extractPayload(Row value) { + try { + return value. getAs("payload"); + } catch (IllegalArgumentException | ClassCastException e) { + logger.error("cannot extract payload from action: {}", value.toString()); + throw e; + } + } + + private static A decodePayload(Class actionPayloadClazz, String payload) throws IOException { + try { + return OBJECT_MAPPER.readValue(payload, actionPayloadClazz); + } catch (UnrecognizedPropertyException e) { + logger.error("error decoding payload: {}", payload); + throw e; + } + } + private static Dataset promoteActionPayloadForGraphTable( Dataset rowDS, Dataset actionPayloadDS, diff --git a/dhp-workflows/dhp-aggregation/pom.xml b/dhp-workflows/dhp-aggregation/pom.xml index c04910a58..a1bc1c483 100644 --- a/dhp-workflows/dhp-aggregation/pom.xml +++ b/dhp-workflows/dhp-aggregation/pom.xml @@ -4,7 +4,7 @@ eu.dnetlib.dhp dhp-workflows - 1.2.3-SNAPSHOT + 1.2.4-SNAPSHOT dhp-aggregation diff --git a/dhp-workflows/dhp-blacklist/pom.xml b/dhp-workflows/dhp-blacklist/pom.xml index 04d334cd7..9c25f7b29 100644 --- a/dhp-workflows/dhp-blacklist/pom.xml +++ b/dhp-workflows/dhp-blacklist/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3-SNAPSHOT + 1.2.4-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-broker-events/pom.xml b/dhp-workflows/dhp-broker-events/pom.xml index f943ac93a..424015a3c 100644 --- a/dhp-workflows/dhp-broker-events/pom.xml +++ b/dhp-workflows/dhp-broker-events/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3-SNAPSHOT + 1.2.4-SNAPSHOT 4.0.0 @@ -53,7 +53,7 @@ eu.dnetlib dnet-openaire-broker-common - [3.0.2,4.0.0) + [3.0.3,4.0.0) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java index bf4f62d24..6e38f7448 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java @@ -11,7 +11,7 @@ import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.time.DateUtils; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; public class EventFactory { @@ -49,8 +49,8 @@ public class EventFactory { private static Map createMapFromResult(final UpdateInfo updateInfo) { final Map map = new HashMap<>(); - final OpenaireBrokerResult source = updateInfo.getSource(); - final OpenaireBrokerResult target = updateInfo.getTarget(); + final OaBrokerMainEntity source = updateInfo.getSource(); + final OaBrokerMainEntity target = updateInfo.getTarget(); map.put("target_datasource_id", target.getCollectedFromId()); map.put("target_datasource_name", target.getCollectedFromName()); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java deleted file mode 100644 index 62171ac61..000000000 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java +++ /dev/null @@ -1,229 +0,0 @@ - -package eu.dnetlib.dhp.broker.oa; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - -import java.util.Optional; - -import org.apache.commons.io.IOUtils; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.TypedColumn; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.broker.model.Event; -import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; -import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; -import eu.dnetlib.dhp.broker.oa.util.EventFinder; -import eu.dnetlib.dhp.broker.oa.util.EventGroup; -import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultAggregator; -import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup; -import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.OpenaireBrokerResultAggregator; -import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedEntityFactory; -import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.schema.oaf.Project; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.schema.oaf.Result; -import eu.dnetlib.dhp.utils.ISLookupClientFactory; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import eu.dnetlib.pace.config.DedupConfig; -import scala.Tuple2; - -public class GenerateEventsApplication { - - private static final Logger log = LoggerFactory.getLogger(GenerateEventsApplication.class); - - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - - public static void main(final String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - GenerateEventsApplication.class - .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_broker_events.json"))); - parser.parseArgument(args); - - final Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - - final String graphPath = parser.get("graphPath"); - log.info("graphPath: {}", graphPath); - - final String eventsPath = parser.get("eventsPath"); - log.info("eventsPath: {}", eventsPath); - - final String isLookupUrl = parser.get("isLookupUrl"); - log.info("isLookupUrl: {}", isLookupUrl); - - final String dedupConfigProfileId = parser.get("dedupConfProfile"); - log.info("dedupConfigProfileId: {}", dedupConfigProfileId); - - final SparkConf conf = new SparkConf(); - // conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - // conf.registerKryoClasses(BrokerConstants.getModelClasses()); - - // TODO UNCOMMENT - // final DedupConfig dedupConfig = loadDedupConfig(isLookupUrl, dedupConfigProfileId); - final DedupConfig dedupConfig = null; - - runWithSparkSession(conf, isSparkSessionManaged, spark -> { - - removeOutputDir(spark, eventsPath); - - // TODO REMOVE THIS - final Dataset projects = readPath(spark, graphPath + "/project", Project.class); - final Dataset rels = readPath(spark, graphPath + "/relation", Relation.class) - .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)) - .cache(); - relatedEntities(projects, rels, RelatedProject.class) - .write() - .mode(SaveMode.Overwrite) - .json(eventsPath); - - // TODO UNCOMMENT THIS - // spark - // .emptyDataset(Encoders.bean(Event.class)) - // .union(generateEvents(spark, graphPath, Publication.class, dedupConfig)) - // .union(generateEvents(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class, dedupConfig)) - // .union(generateEvents(spark, graphPath, Software.class, dedupConfig)) - // .union(generateEvents(spark, graphPath, OtherResearchProduct.class, dedupConfig)) - // .write() - // .mode(SaveMode.Overwrite) - // .option("compression", "gzip") - // .json(eventsPath); - }); - - } - - private static void removeOutputDir(final SparkSession spark, final String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } - - private static Dataset generateEvents( - final SparkSession spark, - final String graphPath, - final Class sourceClass, - final DedupConfig dedupConfig) { - - final Dataset results = expandResultsWithRelations(spark, graphPath, sourceClass); - - final Dataset mergedRels = readPath(spark, graphPath + "/relation", Relation.class) - .filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)); - - final TypedColumn, ResultGroup> aggr = new ResultAggregator() - .toColumn(); - - return results - .joinWith(mergedRels, results.col("openaireId").equalTo(mergedRels.col("source")), "inner") - .groupByKey( - (MapFunction, String>) t -> t._2.getTarget(), Encoders.STRING()) - .agg(aggr) - .map((MapFunction, ResultGroup>) t -> t._2, Encoders.bean(ResultGroup.class)) - .filter(rg -> rg.getData().size() > 1) - .map( - (MapFunction) g -> EventFinder.generateEvents(g, dedupConfig), - Encoders.bean(EventGroup.class)) - .flatMap(group -> group.getData().iterator(), Encoders.bean(Event.class)); - } - - private static Dataset expandResultsWithRelations( - final SparkSession spark, - final String graphPath, - final Class sourceClass) { - - final Dataset projects = readPath(spark, graphPath + "/project", Project.class); - // final Dataset datasets = readPath( - // spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class); - // final Dataset softwares = readPath(spark, graphPath + "/software", Software.class); - // final Dataset publications = readPath(spark, graphPath + "/publication", Publication.class); - - final Dataset rels = readPath(spark, graphPath + "/relation", Relation.class) - .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)) - .cache(); - - final Dataset r0 = readPath( - spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass) - .filter(r -> r.getDataInfo().getDeletedbyinference()) - .map(ConversionUtils::oafResultToBrokerResult, Encoders.bean(OpenaireBrokerResult.class)); - - // TODO UNCOMMENT THIS - final Dataset r1 = join(r0, rels, relatedEntities(projects, rels, RelatedProject.class)); - // final Dataset r2 = join(r1, rels, relatedEntities(softwares, rels, - // RelatedSoftware.class)); - // final Dataset r3 = join(r2, rels, relatedEntities(datasets, rels, - // RelatedDataset.class)); - // final Dataset r4 = join(r3, rels, relatedEntities(publications, rels, - // RelatedPublication.class));; - - return r0; // TODO it should be r4 - } - - private static Dataset relatedEntities(final Dataset targets, - final Dataset rels, - final Class clazz) { - return rels - .joinWith(targets, targets.col("id").equalTo(rels.col("target")), "inner") - .map( - t -> RelatedEntityFactory.newRelatedEntity(t._1.getSource(), t._1.getRelType(), t._2, clazz), - Encoders.bean(clazz)); - } - - private static Dataset join(final Dataset sources, - final Dataset rels, - final Dataset typedRels) { - - final TypedColumn, OpenaireBrokerResult> aggr = new OpenaireBrokerResultAggregator() - .toColumn(); - - return sources - .joinWith(typedRels, sources.col("openaireId").equalTo(rels.col("source")), "left_outer") - .groupByKey( - (MapFunction, String>) t -> t._1.getOpenaireId(), Encoders.STRING()) - .agg(aggr) - .map(t -> t._2, Encoders.bean(OpenaireBrokerResult.class)); - - } - - public static Dataset readPath( - final SparkSession spark, - final String inputPath, - final Class clazz) { - return spark - .read() - .textFile(inputPath) - .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); - } - - private static DedupConfig loadDedupConfig(final String isLookupUrl, final String profId) throws Exception { - - final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl); - - final String conf = isLookUpService - .getResourceProfileByQuery( - String - .format( - "for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", - profId)); - - final DedupConfig dedupConfig = new ObjectMapper().readValue(conf, DedupConfig.class); - dedupConfig.getPace().initModel(); - dedupConfig.getPace().initTranslationMap(); - // dedupConfig.getWf().setConfigurationId("???"); - - return dedupConfig; - } - -} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java new file mode 100644 index 000000000..dbe2fdd47 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java @@ -0,0 +1,103 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.model.Event; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.EventFinder; +import eu.dnetlib.dhp.broker.oa.util.EventGroup; +import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import eu.dnetlib.pace.config.DedupConfig; + +public class GenerateEventsJob { + + private static final Logger log = LoggerFactory.getLogger(GenerateEventsJob.class); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + GenerateEventsJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_events.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); + + final String isLookupUrl = parser.get("isLookupUrl"); + log.info("isLookupUrl: {}", isLookupUrl); + + final String dedupConfigProfileId = parser.get("dedupConfProfile"); + log.info("dedupConfigProfileId: {}", dedupConfigProfileId); + + final String eventsPath = workingPath + "/events"; + log.info("eventsPath: {}", eventsPath); + + final SparkConf conf = new SparkConf(); + + // TODO UNCOMMENT + // final DedupConfig dedupConfig = loadDedupConfig(isLookupUrl, dedupConfigProfileId); + final DedupConfig dedupConfig = null; + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, eventsPath); + + final Dataset groups = ClusterUtils + .readPath(spark, workingPath + "/duplicates", ResultGroup.class); + + final Dataset events = groups + .map( + (MapFunction) g -> EventFinder.generateEvents(g, dedupConfig), + Encoders.bean(EventGroup.class)) + .flatMap(group -> group.getData().iterator(), Encoders.bean(Event.class)); + + events.write().mode(SaveMode.Overwrite).json(eventsPath); + + }); + + } + + private static DedupConfig loadDedupConfig(final String isLookupUrl, final String profId) throws Exception { + + final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl); + + final String conf = isLookUpService + .getResourceProfileByQuery( + String + .format( + "for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", + profId)); + + final DedupConfig dedupConfig = new ObjectMapper().readValue(conf, DedupConfig.class); + dedupConfig.getPace().initModel(); + dedupConfig.getPace().initTranslationMap(); + // dedupConfig.getWf().setConfigurationId("???"); + + return dedupConfig; + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinEntitiesJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinEntitiesJob.java new file mode 100644 index 000000000..868faa8f5 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinEntitiesJob.java @@ -0,0 +1,90 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.expressions.Aggregator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProjectAggregator; +import scala.Tuple2; + +public class JoinEntitiesJob { + + private static final Logger log = LoggerFactory.getLogger(JoinEntitiesJob.class); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + JoinEntitiesJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); + + final String joinedEntitiesPath = workingPath + "/joinedEntities"; + log.info("joinedEntitiesPath: {}", joinedEntitiesPath); + + final SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, joinedEntitiesPath); + + final Dataset r0 = ClusterUtils + .readPath(spark, workingPath + "/simpleEntities", OaBrokerMainEntity.class); + + final Dataset r1 = join( + r0, ClusterUtils.readPath(spark, workingPath + "/relatedProjects", RelatedProject.class), + new RelatedProjectAggregator()); + // final Dataset r2 = join( + // r1, ClusterUtils.readPath(spark, workingPath + "/relatedDatasets", RelatedDataset.class), new + // RelatedDatasetAggregator()); + // final Dataset r3 = join( + // r2, ClusterUtils.readPath(spark, workingPath + "/relatedPublications", RelatedPublication.class), new + // RelatedPublicationAggregator()); + // final Dataset r4 = join( + // r3, ClusterUtils.readPath(spark, workingPath + "/relatedSoftwares", RelatedSoftware.class), new + // RelatedSoftwareAggregator()); + + r1.write().mode(SaveMode.Overwrite).json(joinedEntitiesPath); + + }); + + } + + private static Dataset join(final Dataset sources, + final Dataset typedRels, + final Aggregator, OaBrokerMainEntity, OaBrokerMainEntity> aggr) { + + return sources + .joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer") + .groupByKey( + (MapFunction, String>) t -> t._1.getOpenaireId(), Encoders.STRING()) + .agg(aggr.toColumn()) + .map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class)); + + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java new file mode 100644 index 000000000..934ddff59 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java @@ -0,0 +1,88 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.TypedColumn; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultAggregator; +import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup; +import eu.dnetlib.dhp.schema.oaf.Relation; +import scala.Tuple2; + +public class PrepareGroupsJob { + + private static final Logger log = LoggerFactory.getLogger(PrepareGroupsJob.class); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + JoinEntitiesJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String graphPath = parser.get("graphPath"); + log.info("graphPath: {}", graphPath); + + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); + + final String groupsPath = workingPath + "/duplicates"; + log.info("groupsPath: {}", groupsPath); + + final SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, groupsPath); + + final Dataset results = ClusterUtils + .readPath(spark, workingPath + "/joinedEntities", OaBrokerMainEntity.class); + + final Dataset mergedRels = ClusterUtils + .readPath(spark, graphPath + "/relation", Relation.class) + .filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)); + + final TypedColumn, ResultGroup> aggr = new ResultAggregator() + .toColumn(); + + final Dataset groups = results + .joinWith(mergedRels, results.col("openaireId").equalTo(mergedRels.col("source")), "inner") + .groupByKey( + (MapFunction, String>) t -> t._2.getTarget(), + Encoders.STRING()) + .agg(aggr) + .map( + (MapFunction, ResultGroup>) t -> t._2, Encoders.bean(ResultGroup.class)) + .filter(rg -> rg.getData().size() > 1); + + groups + .write() + .mode(SaveMode.Overwrite) + .json(groupsPath); + + }); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java new file mode 100644 index 000000000..fe9c87e87 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java @@ -0,0 +1,84 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.broker.objects.OaBrokerRelatedDataset; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDataset; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.Relation; + +public class PrepareRelatedDatasetsJob { + + private static final Logger log = LoggerFactory.getLogger(PrepareRelatedDatasetsJob.class); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + PrepareRelatedDatasetsJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String graphPath = parser.get("graphPath"); + log.info("graphPath: {}", graphPath); + + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); + + final String relsPath = workingPath + "/relatedDatasets"; + log.info("relsPath: {}", relsPath); + + final SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, relsPath); + + final Dataset datasets = ClusterUtils + .readPath(spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class) + .filter(d -> !ClusterUtils.isDedupRoot(d.getId())) + .map(ConversionUtils::oafDatasetToBrokerDataset, Encoders.bean(OaBrokerRelatedDataset.class)); + + final Dataset rels = ClusterUtils + .readPath(spark, graphPath + "/relation", Relation.class) + .filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT)) + .filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass())) + .filter(r -> !ClusterUtils.isDedupRoot(r.getSource())) + .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget())); + + rels + .joinWith(datasets, datasets.col("openaireId").equalTo(rels.col("target")), "inner") + .map(t -> { + final RelatedDataset rel = new RelatedDataset(t._1.getSource(), t._2); + rel.getRelDataset().setRelType(t._1.getRelClass()); + return rel; + }, Encoders.bean(RelatedDataset.class)) + .write() + .mode(SaveMode.Overwrite) + .json(relsPath); + + }); + + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java new file mode 100644 index 000000000..3ae240982 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java @@ -0,0 +1,85 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.broker.objects.OaBrokerProject; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.Project; +import eu.dnetlib.dhp.schema.oaf.Relation; + +public class PrepareRelatedProjectsJob { + + private static final Logger log = LoggerFactory.getLogger(PrepareRelatedProjectsJob.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + PrepareRelatedProjectsJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String graphPath = parser.get("graphPath"); + log.info("graphPath: {}", graphPath); + + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); + + final String relsPath = workingPath + "/relatedProjects"; + log.info("relsPath: {}", relsPath); + + final SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, relsPath); + + final Dataset projects = ClusterUtils + .readPath(spark, graphPath + "/project", Project.class) + .filter(p -> !ClusterUtils.isDedupRoot(p.getId())) + .map(ConversionUtils::oafProjectToBrokerProject, Encoders.bean(OaBrokerProject.class)); + + final Dataset rels = ClusterUtils + .readPath(spark, graphPath + "/relation", Relation.class) + .filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT)) + .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)) + .filter(r -> !ClusterUtils.isDedupRoot(r.getSource())) + .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget())); + + rels + .joinWith(projects, projects.col("openaireId").equalTo(rels.col("target")), "inner") + .map(t -> new RelatedProject(t._1.getSource(), t._2), Encoders.bean(RelatedProject.class)) + .write() + .mode(SaveMode.Overwrite) + .json(relsPath); + }); + + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java new file mode 100644 index 000000000..8814ef3e0 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java @@ -0,0 +1,91 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.broker.objects.OaBrokerRelatedPublication; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedPublication; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Relation; + +public class PrepareRelatedPublicationsJob { + + private static final Logger log = LoggerFactory.getLogger(PrepareRelatedPublicationsJob.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + PrepareRelatedPublicationsJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String graphPath = parser.get("graphPath"); + log.info("graphPath: {}", graphPath); + + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); + + final String relsPath = workingPath + "/relatedPublications"; + log.info("relsPath: {}", relsPath); + + final SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, relsPath); + + final Dataset pubs = ClusterUtils + .readPath(spark, graphPath + "/publication", Publication.class) + .filter(p -> !ClusterUtils.isDedupRoot(p.getId())) + .map( + ConversionUtils::oafPublicationToBrokerPublication, + Encoders.bean(OaBrokerRelatedPublication.class)); + + final Dataset rels = ClusterUtils + .readPath(spark, graphPath + "/relation", Relation.class) + .filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT)) + .filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass())) + .filter(r -> !ClusterUtils.isDedupRoot(r.getSource())) + .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget())); + + rels + .joinWith(pubs, pubs.col("openaireId").equalTo(rels.col("target")), "inner") + .map(t -> { + final RelatedPublication rel = new RelatedPublication(t._1.getSource(), t._2); + rel.getRelPublication().setRelType(t._1.getRelClass()); + return rel; + }, Encoders.bean(RelatedPublication.class)) + .write() + .mode(SaveMode.Overwrite) + .json(relsPath); + + }); + + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java new file mode 100644 index 000000000..0704fb44a --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java @@ -0,0 +1,86 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedSoftware; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Software; + +public class PrepareRelatedSoftwaresJob { + + private static final Logger log = LoggerFactory.getLogger(PrepareRelatedSoftwaresJob.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + PrepareRelatedSoftwaresJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String graphPath = parser.get("graphPath"); + log.info("graphPath: {}", graphPath); + + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); + + final String relsPath = workingPath + "/relatedSoftwares"; + log.info("relsPath: {}", relsPath); + + final SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, relsPath); + + final Dataset softwares = ClusterUtils + .readPath(spark, graphPath + "/software", Software.class) + .filter(sw -> !ClusterUtils.isDedupRoot(sw.getId())) + .map(ConversionUtils::oafSoftwareToBrokerSoftware, Encoders.bean(OaBrokerRelatedSoftware.class)); + + final Dataset rels = ClusterUtils + .readPath(spark, graphPath + "/relation", Relation.class) + .filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT)) + .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)) + .filter(r -> !ClusterUtils.isDedupRoot(r.getSource())) + .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget())); + + rels + .joinWith(softwares, softwares.col("openaireId").equalTo(rels.col("target")), "inner") + .map(t -> new RelatedSoftware(t._1.getSource(), t._2), Encoders.bean(RelatedSoftware.class)) + .write() + .mode(SaveMode.Overwrite) + .json(relsPath); + + }); + + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareSimpleEntititiesJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareSimpleEntititiesJob.java new file mode 100644 index 000000000..1b9c279fd --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareSimpleEntititiesJob.java @@ -0,0 +1,82 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.Software; + +public class PrepareSimpleEntititiesJob { + + private static final Logger log = LoggerFactory.getLogger(PrepareSimpleEntititiesJob.class); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + PrepareSimpleEntititiesJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String graphPath = parser.get("graphPath"); + log.info("graphPath: {}", graphPath); + + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); + + final String simpleEntitiesPath = workingPath + "/simpleEntities"; + log.info("simpleEntitiesPath: {}", simpleEntitiesPath); + + final SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + ClusterUtils.removeDir(spark, simpleEntitiesPath); + + prepareSimpleEntities(spark, graphPath, Publication.class) + .union(prepareSimpleEntities(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class)) + .union(prepareSimpleEntities(spark, graphPath, Software.class)) + .union(prepareSimpleEntities(spark, graphPath, OtherResearchProduct.class)) + .write() + .mode(SaveMode.Overwrite) + .json(simpleEntitiesPath); + }); + + } + + private static Dataset prepareSimpleEntities( + final SparkSession spark, + final String graphPath, + final Class sourceClass) { + + return ClusterUtils + .readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass) + .filter(r -> !ClusterUtils.isDedupRoot(r.getId())) + .filter(r -> r.getDataInfo().getDeletedbyinference()) + .map(ConversionUtils::oafResultToBrokerResult, Encoders.bean(OaBrokerMainEntity.class)); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java index 13aeefb2f..9aa6f5384 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java @@ -12,7 +12,7 @@ import java.util.function.Function; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.pace.config.DedupConfig; @@ -21,11 +21,11 @@ public abstract class UpdateMatcher { private final boolean multipleUpdate; private final Function topicFunction; - private final BiConsumer compileHighlightFunction; + private final BiConsumer compileHighlightFunction; private final Function highlightToStringFunction; public UpdateMatcher(final boolean multipleUpdate, final Function topicFunction, - final BiConsumer compileHighlightFunction, + final BiConsumer compileHighlightFunction, final Function highlightToStringFunction) { this.multipleUpdate = multipleUpdate; this.topicFunction = topicFunction; @@ -33,13 +33,13 @@ public abstract class UpdateMatcher { this.highlightToStringFunction = highlightToStringFunction; } - public Collection> searchUpdatesForRecord(final OpenaireBrokerResult res, - final Collection others, + public Collection> searchUpdatesForRecord(final OaBrokerMainEntity res, + final Collection others, final DedupConfig dedupConfig) { final Map> infoMap = new HashMap<>(); - for (final OpenaireBrokerResult source : others) { + for (final OaBrokerMainEntity source : others) { if (source != res) { for (final T hl : findDifferences(source, res)) { final Topic topic = getTopicFunction().apply(hl); @@ -68,7 +68,7 @@ public abstract class UpdateMatcher { } } - protected abstract List findDifferences(OpenaireBrokerResult source, OpenaireBrokerResult target); + protected abstract List findDifferences(OaBrokerMainEntity source, OaBrokerMainEntity target); protected static boolean isMissing(final List list) { return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0)); @@ -86,7 +86,7 @@ public abstract class UpdateMatcher { return topicFunction; } - public BiConsumer getCompileHighlightFunction() { + public BiConsumer getCompileHighlightFunction() { return compileHighlightFunction; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java index 7a58f986b..c197734a3 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java @@ -5,13 +5,12 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import eu.dnetlib.broker.objects.Dataset; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerRelatedDataset; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public abstract class AbstractEnrichMissingDataset - extends UpdateMatcher { +public abstract class AbstractEnrichMissingDataset extends UpdateMatcher { public AbstractEnrichMissingDataset(final Topic topic) { super(true, @@ -23,14 +22,14 @@ public abstract class AbstractEnrichMissingDataset protected abstract boolean filterByType(String relType); @Override - protected final List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected final List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final Set existingDatasets = target .getDatasets() .stream() .filter(rel -> filterByType(rel.getRelType())) - .map(Dataset::getOriginalId) + .map(OaBrokerRelatedDataset::getOriginalId) .collect(Collectors.toSet()); return source diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java index fa5fde725..49c546bba 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java @@ -4,12 +4,12 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedProjects; import java.util.ArrayList; import java.util.List; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.Project; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerProject; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public class EnrichMissingProject extends UpdateMatcher { +public class EnrichMissingProject extends UpdateMatcher { public EnrichMissingProject() { super(true, @@ -19,7 +19,7 @@ public class EnrichMissingProject extends UpdateMatcher { } @Override - protected List findDifferences(final OpenaireBrokerResult source, final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, final OaBrokerMainEntity target) { if (target.getProjects().isEmpty()) { return source.getProjects(); } else { diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java index ca63aeb49..6954a3fb5 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java @@ -5,12 +5,12 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.Project; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerProject; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public class EnrichMoreProject extends UpdateMatcher { +public class EnrichMoreProject extends UpdateMatcher { public EnrichMoreProject() { super(true, @@ -19,13 +19,13 @@ public class EnrichMoreProject extends UpdateMatcher { prj -> projectAsString(prj)); } - private static String projectAsString(final Project prj) { + private static String projectAsString(final OaBrokerProject prj) { return prj.getFunder() + "::" + prj.getFundingProgram() + "::" + prj.getCode(); } @Override - protected List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final Set existingProjects = target .getProjects() diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java index 300863949..ad6d8263b 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java @@ -5,12 +5,12 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.Publication; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerRelatedPublication; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public abstract class AbstractEnrichMissingPublication extends UpdateMatcher { +public abstract class AbstractEnrichMissingPublication extends UpdateMatcher { public AbstractEnrichMissingPublication(final Topic topic) { super(true, @@ -23,15 +23,15 @@ public abstract class AbstractEnrichMissingPublication extends UpdateMatcher findDifferences( - final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected final List findDifferences( + final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final Set existingPublications = target .getPublications() .stream() .filter(rel -> filterByType(rel.getRelType())) - .map(Publication::getOriginalId) + .map(OaBrokerRelatedPublication::getOriginalId) .collect(Collectors.toSet()); return source diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java index 76ae061e6..452caa503 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java @@ -4,12 +4,13 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware; import java.util.ArrayList; import java.util.List; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; public class EnrichMissingSoftware - extends UpdateMatcher { + extends UpdateMatcher { public EnrichMissingSoftware() { super(true, @@ -19,9 +20,9 @@ public class EnrichMissingSoftware } @Override - protected List findDifferences( - final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences( + final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { if (target.getSoftwares().isEmpty()) { return source.getSoftwares(); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java index ebd421b8e..aaffe1249 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java @@ -5,12 +5,12 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.Software; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public class EnrichMoreSoftware extends UpdateMatcher { +public class EnrichMoreSoftware extends UpdateMatcher { public EnrichMoreSoftware() { super(true, @@ -20,14 +20,14 @@ public class EnrichMoreSoftware extends UpdateMatcher { } @Override - protected List findDifferences( - final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences( + final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final Set existingSoftwares = source .getSoftwares() .stream() - .map(Software::getName) + .map(OaBrokerRelatedSoftware::getName) .collect(Collectors.toSet()); return target diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java index b2cbbce2c..73462bae8 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java @@ -5,7 +5,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; @@ -19,7 +19,7 @@ public class EnrichMissingAbstract extends UpdateMatcher { } @Override - protected List findDifferences(final OpenaireBrokerResult source, final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, final OaBrokerMainEntity target) { if (isMissing(target.getAbstracts()) && !isMissing(source.getAbstracts())) { return Arrays.asList(source.getAbstracts().get(0)); } else { diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java index c4b96e67b..2a01188a9 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java @@ -7,12 +7,12 @@ import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; -import eu.dnetlib.broker.objects.Author; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerAuthor; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public class EnrichMissingAuthorOrcid extends UpdateMatcher { +public class EnrichMissingAuthorOrcid extends UpdateMatcher { public EnrichMissingAuthorOrcid() { super(true, @@ -22,13 +22,13 @@ public class EnrichMissingAuthorOrcid extends UpdateMatcher { } @Override - protected List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final Set existingOrcids = target .getCreators() .stream() - .map(Author::getOrcid) + .map(OaBrokerAuthor::getOrcid) .filter(StringUtils::isNotBlank) .collect(Collectors.toSet()); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java index e870cf1fa..487382957 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java @@ -5,28 +5,28 @@ import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; -import eu.dnetlib.broker.objects.Instance; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerInstance; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; -public class EnrichMissingOpenAccess extends UpdateMatcher { +public class EnrichMissingOpenAccess extends UpdateMatcher { public EnrichMissingOpenAccess() { super(true, i -> Topic.ENRICH_MISSING_OA_VERSION, (p, i) -> p.getInstances().add(i), - Instance::getUrl); + OaBrokerInstance::getUrl); } @Override - protected List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final long count = target .getInstances() .stream() - .map(Instance::getLicense) + .map(OaBrokerInstance::getLicense) .filter(right -> right.equals(BrokerConstants.OPEN_ACCESS)) .count(); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java index cc72d9fa9..ee1617b1e 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java @@ -5,12 +5,12 @@ import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.TypedValue; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerTypedValue; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public class EnrichMissingPid extends UpdateMatcher { +public class EnrichMissingPid extends UpdateMatcher { public EnrichMissingPid() { super(true, @@ -20,8 +20,8 @@ public class EnrichMissingPid extends UpdateMatcher { } @Override - protected List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final long count = target.getPids().size(); if (count > 0) { diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java index ed8c26b5a..2c0533fa3 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java @@ -5,7 +5,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; @@ -19,8 +19,8 @@ public class EnrichMissingPublicationDate extends UpdateMatcher { } @Override - protected List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { if (isMissing(target.getPublicationdate()) && !isMissing(source.getPublicationdate())) { return Arrays.asList(source.getPublicationdate()); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java index 07b1fa41a..9ab9fce48 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java @@ -5,12 +5,12 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.TypedValue; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerTypedValue; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public class EnrichMissingSubject extends UpdateMatcher { +public class EnrichMissingSubject extends UpdateMatcher { public EnrichMissingSubject() { super(true, @@ -20,8 +20,8 @@ public class EnrichMissingSubject extends UpdateMatcher { } @Override - protected List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final Set existingSubject = target .getSubjects() .stream() @@ -35,7 +35,7 @@ public class EnrichMissingSubject extends UpdateMatcher { .collect(Collectors.toList()); } - private static String subjectAsString(final TypedValue s) { + private static String subjectAsString(final OaBrokerTypedValue s) { return s.getType() + "::" + s.getValue(); } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java index bfef3ee4f..e90a8f201 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java @@ -5,24 +5,24 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import eu.dnetlib.broker.objects.Instance; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerInstance; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; -public class EnrichMoreOpenAccess extends UpdateMatcher { +public class EnrichMoreOpenAccess extends UpdateMatcher { public EnrichMoreOpenAccess() { super(true, i -> Topic.ENRICH_MORE_OA_VERSION, (p, i) -> p.getInstances().add(i), - Instance::getUrl); + OaBrokerInstance::getUrl); } @Override - protected List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final Set urls = target .getInstances() .stream() diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java index d1f2e6022..43b4f0628 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java @@ -5,12 +5,12 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.TypedValue; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerTypedValue; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public class EnrichMorePid extends UpdateMatcher { +public class EnrichMorePid extends UpdateMatcher { public EnrichMorePid() { super(true, @@ -20,8 +20,8 @@ public class EnrichMorePid extends UpdateMatcher { } @Override - protected List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final Set existingPids = target .getPids() .stream() @@ -35,7 +35,7 @@ public class EnrichMorePid extends UpdateMatcher { .collect(Collectors.toList()); } - private static String pidAsString(final TypedValue pid) { + private static String pidAsString(final OaBrokerTypedValue pid) { return pid.getType() + "::" + pid.getValue(); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java index 39225e8ab..04fb494ef 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java @@ -5,12 +5,12 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.TypedValue; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerTypedValue; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; -public class EnrichMoreSubject extends UpdateMatcher { +public class EnrichMoreSubject extends UpdateMatcher { public EnrichMoreSubject() { super(true, @@ -20,8 +20,8 @@ public class EnrichMoreSubject extends UpdateMatcher { } @Override - protected List findDifferences(final OpenaireBrokerResult source, - final OpenaireBrokerResult target) { + protected List findDifferences(final OaBrokerMainEntity source, + final OaBrokerMainEntity target) { final Set existingSubjects = target .getSubjects() .stream() @@ -35,7 +35,7 @@ public class EnrichMoreSubject extends UpdateMatcher { .collect(Collectors.toList()); } - private static String subjectAsString(final TypedValue s) { + private static String subjectAsString(final OaBrokerTypedValue s) { return s.getType() + "::" + s.getValue(); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java new file mode 100644 index 000000000..de9b901d0 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java @@ -0,0 +1,47 @@ + +package eu.dnetlib.dhp.broker.oa.util; + +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.common.HdfsSupport; + +public class ClusterUtils { + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void createDirIfMissing(final SparkSession spark, final String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } + + public static void removeDir(final SparkSession spark, final String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } + + public static Dataset readPath( + final SparkSession spark, + final String inputPath, + final Class clazz) { + return spark + .read() + .textFile(inputPath) + .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); + } + + public static boolean isDedupRoot(final String id) { + return id.contains("dedup_wf_"); + } + + public static final boolean isValidResultResultClass(final String s) { + return s.equals("isReferencedBy") + || s.equals("isRelatedTo") + || s.equals("references") + || s.equals("isSupplementedBy") + || s.equals("isSupplementedTo"); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java index 730d06519..b61d5e7cc 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java @@ -15,8 +15,16 @@ import org.slf4j.LoggerFactory; import com.google.common.base.Function; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.TypedValue; +import eu.dnetlib.broker.objects.OaBrokerAuthor; +import eu.dnetlib.broker.objects.OaBrokerExternalReference; +import eu.dnetlib.broker.objects.OaBrokerInstance; +import eu.dnetlib.broker.objects.OaBrokerJournal; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerProject; +import eu.dnetlib.broker.objects.OaBrokerRelatedDataset; +import eu.dnetlib.broker.objects.OaBrokerRelatedPublication; +import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware; +import eu.dnetlib.broker.objects.OaBrokerTypedValue; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.Dataset; import eu.dnetlib.dhp.schema.oaf.ExternalReference; @@ -35,13 +43,13 @@ public class ConversionUtils { private static final Logger log = LoggerFactory.getLogger(ConversionUtils.class); - public static List oafInstanceToBrokerInstances(final Instance i) { + public static List oafInstanceToBrokerInstances(final Instance i) { if (i == null) { return new ArrayList<>(); } return mappedList(i.getUrl(), url -> { - final eu.dnetlib.broker.objects.Instance res = new eu.dnetlib.broker.objects.Instance(); + final OaBrokerInstance res = new OaBrokerInstance(); res.setUrl(url); res.setInstancetype(classId(i.getInstancetype())); res.setLicense(BrokerConstants.OPEN_ACCESS); @@ -50,20 +58,21 @@ public class ConversionUtils { }); } - public static TypedValue oafPidToBrokerPid(final StructuredProperty sp) { + public static OaBrokerTypedValue oafPidToBrokerPid(final StructuredProperty sp) { return oafStructPropToBrokerTypedValue(sp); } - public static TypedValue oafStructPropToBrokerTypedValue(final StructuredProperty sp) { - return sp != null ? new TypedValue(classId(sp.getQualifier()), sp.getValue()) : null; + public static OaBrokerTypedValue oafStructPropToBrokerTypedValue(final StructuredProperty sp) { + return sp != null ? new OaBrokerTypedValue(classId(sp.getQualifier()), sp.getValue()) : null; } - public static final eu.dnetlib.broker.objects.Dataset oafDatasetToBrokerDataset(final Dataset d) { + public static final OaBrokerRelatedDataset oafDatasetToBrokerDataset(final Dataset d) { if (d == null) { return null; } - final eu.dnetlib.broker.objects.Dataset res = new eu.dnetlib.broker.objects.Dataset(); + final OaBrokerRelatedDataset res = new OaBrokerRelatedDataset(); + res.setOpenaireId(d.getId()); res.setOriginalId(first(d.getOriginalId())); res.setTitle(structPropValue(d.getTitle())); res.setPids(mappedList(d.getPid(), ConversionUtils::oafPidToBrokerPid)); @@ -72,12 +81,13 @@ public class ConversionUtils { return res; } - public static eu.dnetlib.broker.objects.Publication oafPublicationToBrokerPublication(final Publication p) { + public static OaBrokerRelatedPublication oafPublicationToBrokerPublication(final Publication p) { if (p == null) { return null; } - final eu.dnetlib.broker.objects.Publication res = new eu.dnetlib.broker.objects.Publication(); + final OaBrokerRelatedPublication res = new OaBrokerRelatedPublication(); + res.setOpenaireId(p.getId()); res.setOriginalId(first(p.getOriginalId())); res.setTitle(structPropValue(p.getTitle())); res.setPids(mappedList(p.getPid(), ConversionUtils::oafPidToBrokerPid)); @@ -87,12 +97,12 @@ public class ConversionUtils { return res; } - public static final OpenaireBrokerResult oafResultToBrokerResult(final Result result) { + public static final OaBrokerMainEntity oafResultToBrokerResult(final Result result) { if (result == null) { return null; } - final OpenaireBrokerResult res = new OpenaireBrokerResult(); + final OaBrokerMainEntity res = new OaBrokerMainEntity(); res.setOpenaireId(result.getId()); res.setOriginalId(first(result.getOriginalId())); @@ -118,7 +128,7 @@ public class ConversionUtils { return res; } - private static eu.dnetlib.broker.objects.Author oafAuthorToBrokerAuthor(final Author author) { + private static OaBrokerAuthor oafAuthorToBrokerAuthor(final Author author) { if (author == null) { return null; } @@ -135,15 +145,15 @@ public class ConversionUtils { .findFirst() .orElse(null) : null; - return new eu.dnetlib.broker.objects.Author(author.getFullname(), pids); + return new OaBrokerAuthor(author.getFullname(), pids); } - private static eu.dnetlib.broker.objects.Journal oafJournalToBrokerJournal(final Journal journal) { + private static OaBrokerJournal oafJournalToBrokerJournal(final Journal journal) { if (journal == null) { return null; } - final eu.dnetlib.broker.objects.Journal res = new eu.dnetlib.broker.objects.Journal(); + final OaBrokerJournal res = new OaBrokerJournal(); res.setName(journal.getName()); res.setIssn(journal.getIssnPrinted()); res.setEissn(journal.getIssnOnline()); @@ -152,12 +162,12 @@ public class ConversionUtils { return res; } - private static eu.dnetlib.broker.objects.ExternalReference oafExtRefToBrokerExtRef(final ExternalReference ref) { + private static OaBrokerExternalReference oafExtRefToBrokerExtRef(final ExternalReference ref) { if (ref == null) { return null; } - final eu.dnetlib.broker.objects.ExternalReference res = new eu.dnetlib.broker.objects.ExternalReference(); + final OaBrokerExternalReference res = new OaBrokerExternalReference(); res.setRefidentifier(ref.getRefidentifier()); res.setSitename(ref.getSitename()); res.setType(classId(ref.getQualifier())); @@ -165,12 +175,13 @@ public class ConversionUtils { return res; } - public static final eu.dnetlib.broker.objects.Project oafProjectToBrokerProject(final Project p) { + public static final OaBrokerProject oafProjectToBrokerProject(final Project p) { if (p == null) { return null; } - final eu.dnetlib.broker.objects.Project res = new eu.dnetlib.broker.objects.Project(); + final OaBrokerProject res = new OaBrokerProject(); + res.setOpenaireId(p.getId()); res.setTitle(fieldValue(p.getTitle())); res.setAcronym(fieldValue(p.getAcronym())); res.setCode(fieldValue(p.getCode())); @@ -190,12 +201,13 @@ public class ConversionUtils { return res; } - public static final eu.dnetlib.broker.objects.Software oafSoftwareToBrokerSoftware(final Software sw) { + public static final OaBrokerRelatedSoftware oafSoftwareToBrokerSoftware(final Software sw) { if (sw == null) { return null; } - final eu.dnetlib.broker.objects.Software res = new eu.dnetlib.broker.objects.Software(); + final OaBrokerRelatedSoftware res = new OaBrokerRelatedSoftware(); + res.setOpenaireId(sw.getId()); res.setName(structPropValue(sw.getTitle())); res.setDescription(fieldValue(sw.getDescription())); res.setRepository(fieldValue(sw.getCodeRepositoryUrl())); @@ -247,7 +259,7 @@ public class ConversionUtils { : new ArrayList<>(); } - private static List structPropTypedList(final List list) { + private static List structPropTypedList(final List list) { if (list == null) { return new ArrayList<>(); } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java index 4c20ac5ca..7451e5891 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java @@ -4,7 +4,7 @@ package eu.dnetlib.dhp.broker.oa.util; import java.util.ArrayList; import java.util.List; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.model.EventFactory; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsReferencedBy; @@ -68,7 +68,7 @@ public class EventFinder { public static EventGroup generateEvents(final ResultGroup results, final DedupConfig dedupConfig) { final List> list = new ArrayList<>(); - for (final OpenaireBrokerResult target : results.getData()) { + for (final OaBrokerMainEntity target : results.getData()) { for (final UpdateMatcher matcher : matchers) { list.addAll(matcher.searchUpdatesForRecord(target, results.getData(), dedupConfig)); } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java index 2c4bda53d..25d0d2bca 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java @@ -9,10 +9,10 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.broker.objects.Instance; -import eu.dnetlib.broker.objects.OpenAireEventPayload; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import eu.dnetlib.broker.objects.Provenance; +import eu.dnetlib.broker.objects.OaBrokerEventPayload; +import eu.dnetlib.broker.objects.OaBrokerInstance; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerProvenance; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.model.MapDocument; @@ -25,11 +25,11 @@ public final class UpdateInfo { private final T highlightValue; - private final OpenaireBrokerResult source; + private final OaBrokerMainEntity source; - private final OpenaireBrokerResult target; + private final OaBrokerMainEntity target; - private final BiConsumer compileHighlight; + private final BiConsumer compileHighlight; private final Function highlightToString; @@ -37,9 +37,9 @@ public final class UpdateInfo { private static final Logger log = LoggerFactory.getLogger(UpdateInfo.class); - public UpdateInfo(final Topic topic, final T highlightValue, final OpenaireBrokerResult source, - final OpenaireBrokerResult target, - final BiConsumer compileHighlight, + public UpdateInfo(final Topic topic, final T highlightValue, final OaBrokerMainEntity source, + final OaBrokerMainEntity target, + final BiConsumer compileHighlight, final Function highlightToString, final DedupConfig dedupConfig) { this.topic = topic; @@ -55,17 +55,17 @@ public final class UpdateInfo { return highlightValue; } - public OpenaireBrokerResult getSource() { + public OaBrokerMainEntity getSource() { return source; } - public OpenaireBrokerResult getTarget() { + public OaBrokerMainEntity getTarget() { return target; } private float calculateTrust(final DedupConfig dedupConfig, - final OpenaireBrokerResult r1, - final OpenaireBrokerResult r2) { + final OaBrokerMainEntity r1, + final OaBrokerMainEntity r2) { if (dedupConfig == null) { return BrokerConstants.MIN_TRUST; @@ -104,11 +104,11 @@ public final class UpdateInfo { return highlightToString.apply(getHighlightValue()); } - public OpenAireEventPayload asBrokerPayload() { + public OaBrokerEventPayload asBrokerPayload() { compileHighlight.accept(target, getHighlightValue()); - final OpenaireBrokerResult hl = new OpenaireBrokerResult(); + final OaBrokerMainEntity hl = new OaBrokerMainEntity(); compileHighlight.accept(hl, getHighlightValue()); final String provId = getSource().getOriginalId(); @@ -117,14 +117,14 @@ public final class UpdateInfo { final String provUrl = getSource() .getInstances() .stream() - .map(Instance::getUrl) + .map(OaBrokerInstance::getUrl) .findFirst() .orElse(null); ; - final Provenance provenance = new Provenance(provId, provRepo, provUrl); + final OaBrokerProvenance provenance = new OaBrokerProvenance(provId, provRepo, provUrl); - final OpenAireEventPayload res = new OpenAireEventPayload(); + final OaBrokerEventPayload res = new OaBrokerEventPayload(); res.setResult(target); res.setHighlight(hl); res.setTrust(trust); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultAggregator.java index a46fde445..ee1c8963e 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultAggregator.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultAggregator.java @@ -5,11 +5,11 @@ import org.apache.spark.sql.Encoder; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.expressions.Aggregator; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.schema.oaf.Relation; import scala.Tuple2; -public class ResultAggregator extends Aggregator, ResultGroup, ResultGroup> { +public class ResultAggregator extends Aggregator, ResultGroup, ResultGroup> { /** * @@ -22,7 +22,7 @@ public class ResultAggregator extends Aggregator t) { + public ResultGroup reduce(final ResultGroup group, final Tuple2 t) { group.getData().add(t._1); return group; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultGroup.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultGroup.java index 3f9dbe8af..e718e0f1c 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultGroup.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/simple/ResultGroup.java @@ -5,7 +5,7 @@ import java.io.Serializable; import java.util.ArrayList; import java.util.List; -import eu.dnetlib.broker.objects.OpenaireBrokerResult; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; public class ResultGroup implements Serializable { @@ -14,13 +14,13 @@ public class ResultGroup implements Serializable { */ private static final long serialVersionUID = -3360828477088669296L; - private List data = new ArrayList<>(); + private List data = new ArrayList<>(); - public List getData() { + public List getData() { return data; } - public void setData(final List data) { + public void setData(final List data) { this.data = data; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/OpenaireBrokerResultAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/OpenaireBrokerResultAggregator.java deleted file mode 100644 index e72dcb988..000000000 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/OpenaireBrokerResultAggregator.java +++ /dev/null @@ -1,69 +0,0 @@ - -package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; - -import org.apache.spark.sql.Encoder; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.expressions.Aggregator; - -import eu.dnetlib.broker.objects.OpenaireBrokerResult; -import scala.Tuple2; - -public class OpenaireBrokerResultAggregator - extends Aggregator, OpenaireBrokerResult, OpenaireBrokerResult> { - - /** - * - */ - private static final long serialVersionUID = -3687878788861013488L; - - @Override - public OpenaireBrokerResult zero() { - return new OpenaireBrokerResult(); - } - - @Override - public OpenaireBrokerResult finish(final OpenaireBrokerResult g) { - return g; - } - - @Override - public OpenaireBrokerResult reduce(final OpenaireBrokerResult g, final Tuple2 t) { - if (g.getOriginalId() == null) { - return t._1; - } else if (t._2 instanceof RelatedSoftware) { - g.getSoftwares().add(((RelatedSoftware) t._2).getRelSoftware()); - } else if (t._2 instanceof RelatedDataset) { - g.getDatasets().add(((RelatedDataset) t._2).getRelDataset()); - } else if (t._2 instanceof RelatedPublication) { - g.getPublications().add(((RelatedPublication) t._2).getRelPublication()); - } else if (t._2 instanceof RelatedProject) { - g.getProjects().add(((RelatedProject) t._2).getRelProject()); - } - return g; - - } - - @Override - public OpenaireBrokerResult merge(final OpenaireBrokerResult g1, final OpenaireBrokerResult g2) { - if (g1.getOriginalId() != null) { - g1.getSoftwares().addAll(g2.getSoftwares()); - g1.getDatasets().addAll(g2.getDatasets()); - g1.getPublications().addAll(g2.getPublications()); - g1.getProjects().addAll(g2.getProjects()); - return g1; - } else { - return g2; - } - } - - @Override - public Encoder bufferEncoder() { - return Encoders.bean(OpenaireBrokerResult.class); - } - - @Override - public Encoder outputEncoder() { - return Encoders.bean(OpenaireBrokerResult.class); - } - -} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDataset.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDataset.java index 6a5fb258c..0925e3291 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDataset.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDataset.java @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; import java.io.Serializable; -import eu.dnetlib.broker.objects.Dataset; +import eu.dnetlib.broker.objects.OaBrokerRelatedDataset; public class RelatedDataset implements Serializable { @@ -11,16 +11,15 @@ public class RelatedDataset implements Serializable { * */ private static final long serialVersionUID = 774487705184038324L; + private String source; - private String relType; - private Dataset relDataset; + private OaBrokerRelatedDataset relDataset; public RelatedDataset() { } - public RelatedDataset(final String source, final String relType, final Dataset relDataset) { + public RelatedDataset(final String source, final OaBrokerRelatedDataset relDataset) { this.source = source; - this.relType = relType; this.relDataset = relDataset; } @@ -32,19 +31,11 @@ public class RelatedDataset implements Serializable { this.source = source; } - public String getRelType() { - return relType; - } - - public void setRelType(final String relType) { - this.relType = relType; - } - - public Dataset getRelDataset() { + public OaBrokerRelatedDataset getRelDataset() { return relDataset; } - public void setRelDataset(final Dataset relDataset) { + public void setRelDataset(final OaBrokerRelatedDataset relDataset) { this.relDataset = relDataset; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDatasetAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDatasetAggregator.java new file mode 100644 index 000000000..a963f073d --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDatasetAggregator.java @@ -0,0 +1,60 @@ + +package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; + +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.expressions.Aggregator; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import scala.Tuple2; + +public class RelatedDatasetAggregator + extends Aggregator, OaBrokerMainEntity, OaBrokerMainEntity> { + + /** + * + */ + private static final long serialVersionUID = 6969761680131482557L; + + @Override + public OaBrokerMainEntity zero() { + return new OaBrokerMainEntity(); + } + + @Override + public OaBrokerMainEntity finish(final OaBrokerMainEntity g) { + return g; + } + + @Override + public OaBrokerMainEntity reduce(final OaBrokerMainEntity g, final Tuple2 t) { + final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOpenaireId()) ? g : t._1; + if (t._2 != null) { + res.getDatasets().add(t._2.getRelDataset()); + } + return res; + + } + + @Override + public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) { + if (StringUtils.isNotBlank(g1.getOpenaireId())) { + g1.getDatasets().addAll(g2.getDatasets()); + return g1; + } else { + return g2; + } + } + + @Override + public Encoder bufferEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); + } + + @Override + public Encoder outputEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedEntityFactory.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedEntityFactory.java deleted file mode 100644 index c60d4f141..000000000 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedEntityFactory.java +++ /dev/null @@ -1,34 +0,0 @@ - -package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; - -import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; -import eu.dnetlib.dhp.schema.oaf.Dataset; -import eu.dnetlib.dhp.schema.oaf.Project; -import eu.dnetlib.dhp.schema.oaf.Publication; -import eu.dnetlib.dhp.schema.oaf.Software; - -public class RelatedEntityFactory { - - @SuppressWarnings("unchecked") - public static RT newRelatedEntity(final String sourceId, - final String relType, - final T target, - final Class clazz) { - - if (clazz == RelatedProject.class) { - return (RT) new RelatedProject(sourceId, relType, - ConversionUtils.oafProjectToBrokerProject((Project) target)); - } else if (clazz == RelatedSoftware.class) { - return (RT) new RelatedSoftware(sourceId, relType, - ConversionUtils.oafSoftwareToBrokerSoftware((Software) target)); - } else if (clazz == RelatedDataset.class) { - return (RT) new RelatedDataset(sourceId, relType, - ConversionUtils.oafDatasetToBrokerDataset((Dataset) target)); - } else if (clazz == RelatedPublication.class) { - return (RT) new RelatedPublication(sourceId, relType, - ConversionUtils.oafPublicationToBrokerPublication((Publication) target)); - } else { - return null; - } - } -} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProject.java index fafec1e19..74d19fe9d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProject.java @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; import java.io.Serializable; -import eu.dnetlib.broker.objects.Project; +import eu.dnetlib.broker.objects.OaBrokerProject; public class RelatedProject implements Serializable { @@ -13,15 +13,13 @@ public class RelatedProject implements Serializable { private static final long serialVersionUID = 4941437626549329870L; private String source; - private String relType; - private Project relProject; + private OaBrokerProject relProject; public RelatedProject() { } - public RelatedProject(final String source, final String relType, final Project relProject) { + public RelatedProject(final String source, final OaBrokerProject relProject) { this.source = source; - this.relType = relType; this.relProject = relProject; } @@ -33,19 +31,11 @@ public class RelatedProject implements Serializable { this.source = source; } - public String getRelType() { - return relType; - } - - public void setRelType(final String relType) { - this.relType = relType; - } - - public Project getRelProject() { + public OaBrokerProject getRelProject() { return relProject; } - public void setRelProject(final Project relProject) { + public void setRelProject(final OaBrokerProject relProject) { this.relProject = relProject; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProjectAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProjectAggregator.java new file mode 100644 index 000000000..3fedb1a32 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedProjectAggregator.java @@ -0,0 +1,60 @@ + +package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; + +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.expressions.Aggregator; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import scala.Tuple2; + +public class RelatedProjectAggregator + extends Aggregator, OaBrokerMainEntity, OaBrokerMainEntity> { + + /** + * + */ + private static final long serialVersionUID = 8559808519152275763L; + + @Override + public OaBrokerMainEntity zero() { + return new OaBrokerMainEntity(); + } + + @Override + public OaBrokerMainEntity finish(final OaBrokerMainEntity g) { + return g; + } + + @Override + public OaBrokerMainEntity reduce(final OaBrokerMainEntity g, final Tuple2 t) { + final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOpenaireId()) ? g : t._1; + if (t._2 != null) { + res.getProjects().add(t._2.getRelProject()); + } + return res; + + } + + @Override + public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) { + if (StringUtils.isNotBlank(g1.getOpenaireId())) { + g1.getProjects().addAll(g2.getProjects()); + return g1; + } else { + return g2; + } + } + + @Override + public Encoder bufferEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); + } + + @Override + public Encoder outputEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublication.java index 8a31ddf7e..ed6aeeab1 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublication.java @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; import java.io.Serializable; -import eu.dnetlib.broker.objects.Publication; +import eu.dnetlib.broker.objects.OaBrokerRelatedPublication; public class RelatedPublication implements Serializable { @@ -13,15 +13,13 @@ public class RelatedPublication implements Serializable { private static final long serialVersionUID = 9021609640411395128L; private String source; - private String relType; - private Publication relPublication; + private OaBrokerRelatedPublication relPublication; public RelatedPublication() { } - public RelatedPublication(final String source, final String relType, final Publication relPublication) { + public RelatedPublication(final String source, final OaBrokerRelatedPublication relPublication) { this.source = source; - this.relType = relType; this.relPublication = relPublication; } @@ -33,19 +31,11 @@ public class RelatedPublication implements Serializable { this.source = source; } - public String getRelType() { - return relType; - } - - public void setRelType(final String relType) { - this.relType = relType; - } - - public Publication getRelPublication() { + public OaBrokerRelatedPublication getRelPublication() { return relPublication; } - public void setRelPublication(final Publication relPublication) { + public void setRelPublication(final OaBrokerRelatedPublication relPublication) { this.relPublication = relPublication; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublicationAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublicationAggregator.java new file mode 100644 index 000000000..b331599ad --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedPublicationAggregator.java @@ -0,0 +1,61 @@ + +package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; + +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.expressions.Aggregator; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import scala.Tuple2; + +public class RelatedPublicationAggregator + extends Aggregator, OaBrokerMainEntity, OaBrokerMainEntity> { + + /** + * + */ + private static final long serialVersionUID = 4656934981558135919L; + + @Override + public OaBrokerMainEntity zero() { + return new OaBrokerMainEntity(); + } + + @Override + public OaBrokerMainEntity finish(final OaBrokerMainEntity g) { + return g; + } + + @Override + public OaBrokerMainEntity reduce(final OaBrokerMainEntity g, + final Tuple2 t) { + final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOpenaireId()) ? g : t._1; + if (t._2 != null) { + res.getPublications().add(t._2.getRelPublication()); + } + return res; + + } + + @Override + public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) { + if (StringUtils.isNotBlank(g1.getOpenaireId())) { + g1.getPublications().addAll(g2.getPublications()); + return g1; + } else { + return g2; + } + } + + @Override + public Encoder bufferEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); + } + + @Override + public Encoder outputEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftware.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftware.java index 319387469..0aa3a4045 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftware.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftware.java @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; import java.io.Serializable; -import eu.dnetlib.broker.objects.Software; +import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware; public class RelatedSoftware implements Serializable { @@ -11,16 +11,15 @@ public class RelatedSoftware implements Serializable { * */ private static final long serialVersionUID = 7573383356943300157L; + private String source; - private String relType; - private Software relSoftware; + private OaBrokerRelatedSoftware relSoftware; public RelatedSoftware() { } - public RelatedSoftware(final String source, final String relType, final Software relSoftware) { + public RelatedSoftware(final String source, final OaBrokerRelatedSoftware relSoftware) { this.source = source; - this.relType = relType; this.relSoftware = relSoftware; } @@ -32,19 +31,11 @@ public class RelatedSoftware implements Serializable { this.source = source; } - public String getRelType() { - return relType; - } - - public void setRelType(final String relType) { - this.relType = relType; - } - - public Software getRelSoftware() { + public OaBrokerRelatedSoftware getRelSoftware() { return relSoftware; } - public void setRelSoftware(final Software relSoftware) { + public void setRelSoftware(final OaBrokerRelatedSoftware relSoftware) { this.relSoftware = relSoftware; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftwareAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftwareAggregator.java new file mode 100644 index 000000000..d3b1c3407 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedSoftwareAggregator.java @@ -0,0 +1,60 @@ + +package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; + +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.expressions.Aggregator; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import scala.Tuple2; + +public class RelatedSoftwareAggregator + extends Aggregator, OaBrokerMainEntity, OaBrokerMainEntity> { + + /** + * + */ + private static final long serialVersionUID = -8987959389106443702L; + + @Override + public OaBrokerMainEntity zero() { + return new OaBrokerMainEntity(); + } + + @Override + public OaBrokerMainEntity finish(final OaBrokerMainEntity g) { + return g; + } + + @Override + public OaBrokerMainEntity reduce(final OaBrokerMainEntity g, final Tuple2 t) { + final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOpenaireId()) ? g : t._1; + if (t._2 != null) { + res.getSoftwares().add(t._2.getRelSoftware()); + } + return res; + + } + + @Override + public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) { + if (StringUtils.isNotBlank(g1.getOpenaireId())) { + g1.getSoftwares().addAll(g2.getSoftwares()); + return g1; + } else { + return g2; + } + } + + @Override + public Encoder bufferEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); + } + + @Override + public Encoder outputEncoder() { + return Encoders.bean(OaBrokerMainEntity.class); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/common_params.json b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/common_params.json new file mode 100644 index 000000000..adee1888a --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/common_params.json @@ -0,0 +1,14 @@ +[ + { + "paramName": "g", + "paramLongName": "graphPath", + "paramDescription": "the path where there the graph is stored", + "paramRequired": true + }, + { + "paramName": "o", + "paramLongName": "workingPath", + "paramDescription": "the path where the temporary data will be stored", + "paramRequired": true + } +] diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml index ea9aabcfc..18e2eedca 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml @@ -6,8 +6,8 @@ the path where the graph is stored - eventsOutputPath - the path where the the events will be stored + workingPath + the path where the the generated data will be stored isLookupUrl @@ -73,18 +73,34 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + + + + + + + + + + + + + + + + + yarn cluster - GenerateEvents - eu.dnetlib.dhp.broker.oa.GenerateEventsApplication + PrepareSimpleEntititiesJob + eu.dnetlib.dhp.broker.oa.PrepareSimpleEntititiesJob dhp-broker-events-${projectVersion}.jar --executor-cores=${sparkExecutorCores} @@ -97,14 +113,183 @@ --conf spark.sql.shuffle.partitions=3840 --graphPath${graphInputPath} - --eventsPath${eventsOutputPath} + --workingPath${workingPath} + + + + + + + + + yarn + cluster + PrepareRelatedDatasetsJob + eu.dnetlib.dhp.broker.oa.PrepareRelatedDatasetsJob + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + yarn + cluster + PrepareRelatedProjectsJob + eu.dnetlib.dhp.broker.oa.PrepareRelatedProjectsJob + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + yarn + cluster + PrepareRelatedPublicationsJob + eu.dnetlib.dhp.broker.oa.PrepareRelatedPublicationsJob + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + yarn + cluster + PrepareRelatedSoftwaresJob + eu.dnetlib.dhp.broker.oa.PrepareRelatedSoftwaresJob + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + + + yarn + cluster + JoinEntitiesJob + eu.dnetlib.dhp.broker.oa.JoinEntitiesJob + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + yarn + cluster + PrepareGroupsJob + eu.dnetlib.dhp.broker.oa.PrepareGroupsJob + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + yarn + cluster + GenerateEventsJob + eu.dnetlib.dhp.broker.oa.GenerateEventsJob + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --workingPath${workingPath} --isLookupUrl${isLookupUrl} --dedupConfProfile${dedupConfProfId} - diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_broker_events.json b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_events.json similarity index 70% rename from dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_broker_events.json rename to dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_events.json index 6ab6d9a2d..7ae076159 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_broker_events.json +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_events.json @@ -1,13 +1,7 @@ [ - { - "paramName": "g", - "paramLongName": "graphPath", - "paramDescription": "the path where there the graph is stored", - "paramRequired": true - }, { "paramName": "o", - "paramLongName": "eventsPath", + "paramLongName": "workingPath", "paramDescription": "the path where the generated events will be stored", "paramRequired": true }, diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/config-default.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/config-default.xml new file mode 100644 index 000000000..2e0ed9aee --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml new file mode 100644 index 000000000..1ccdef929 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml @@ -0,0 +1,158 @@ + + + + + graphInputPath + the path where the graph is stored + + + workingPath + the path where the the generated data will be stored + + + isLookupUrl + the address of the lookUp service + + + dedupConfProfId + the id of a valid Dedup Configuration Profile + + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + oozieActionShareLibForSpark2 + oozie action sharelib for spark 2.* + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + spark 2.* sql query execution listeners classname + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + + + + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + yarn + cluster + JoinEntitiesJob + eu.dnetlib.dhp.broker.oa.JoinEntitiesJob + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + yarn + cluster + PrepareGroupsJob + eu.dnetlib.dhp.broker.oa.PrepareGroupsJob + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + yarn + cluster + GenerateEventsJob + eu.dnetlib.dhp.broker.oa.GenerateEventsJob + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --workingPath${workingPath} + --isLookupUrl${isLookupUrl} + --dedupConfProfile${dedupConfProfId} + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-openaire/pom.xml b/dhp-workflows/dhp-dedup-openaire/pom.xml index 1f5f2620e..03ddbcf4c 100644 --- a/dhp-workflows/dhp-dedup-openaire/pom.xml +++ b/dhp-workflows/dhp-dedup-openaire/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3-SNAPSHOT + 1.2.4-SNAPSHOT 4.0.0 dhp-dedup-openaire diff --git a/dhp-workflows/dhp-dedup-scholexplorer/pom.xml b/dhp-workflows/dhp-dedup-scholexplorer/pom.xml index e9e11b417..aa4070b01 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/pom.xml +++ b/dhp-workflows/dhp-dedup-scholexplorer/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3-SNAPSHOT + 1.2.4-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-distcp/pom.xml b/dhp-workflows/dhp-distcp/pom.xml index 5707ddfc5..8c10538c0 100644 --- a/dhp-workflows/dhp-distcp/pom.xml +++ b/dhp-workflows/dhp-distcp/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3-SNAPSHOT + 1.2.4-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-doiboost/pom.xml b/dhp-workflows/dhp-doiboost/pom.xml index 39bb81ec1..3299c1496 100644 --- a/dhp-workflows/dhp-doiboost/pom.xml +++ b/dhp-workflows/dhp-doiboost/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3-SNAPSHOT + 1.2.4-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-enrichment/pom.xml b/dhp-workflows/dhp-enrichment/pom.xml index e71a72f3e..d0ab77cc5 100644 --- a/dhp-workflows/dhp-enrichment/pom.xml +++ b/dhp-workflows/dhp-enrichment/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3-SNAPSHOT + 1.2.4-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-graph-mapper/pom.xml b/dhp-workflows/dhp-graph-mapper/pom.xml index f650f1c17..0439c2ba3 100644 --- a/dhp-workflows/dhp-graph-mapper/pom.xml +++ b/dhp-workflows/dhp-graph-mapper/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3-SNAPSHOT + 1.2.4-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java index c90898814..8f43ab1cf 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java @@ -144,6 +144,9 @@ public class CleanGraphSparkJob { if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) { i.setAccessright(qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES)); } + if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) { + i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY); + } } } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 9bd20303f..b1f0ecf0d 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -111,6 +111,7 @@ public class MappersTest { assertNotNull(i.getAccessright()); assertEquals("OPEN", i.getAccessright().getClassid()); }); + assertEquals("0001", p.getInstance().get(0).getRefereed().getClassid()); assertNotNull(p.getBestaccessright()); assertEquals("OPEN", p.getBestaccessright().getClassid()); @@ -217,6 +218,7 @@ public class MappersTest { assertNotNull(i.getAccessright()); assertEquals("OPEN", i.getAccessright().getClassid()); }); + assertEquals("0001", d.getInstance().get(0).getRefereed().getClassid()); assertValidId(r1.getSource()); assertValidId(r1.getTarget()); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml index 2cb0ba1c7..ead22aa96 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml @@ -57,6 +57,7 @@ 10.3897/oneeco.2.e13718 https://oneecosystem.pensoft.net/article/13718/ One Ecosystem + 0001 diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml index 88ae9d106..5525a2753 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml @@ -90,6 +90,7 @@ corda_______::226852 + 0001s dhp-workflows eu.dnetlib.dhp - 1.2.3-SNAPSHOT + 1.2.4-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-graph-provision/pom.xml b/dhp-workflows/dhp-graph-provision/pom.xml index b0aec1e5d..fa1964773 100644 --- a/dhp-workflows/dhp-graph-provision/pom.xml +++ b/dhp-workflows/dhp-graph-provision/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3-SNAPSHOT + 1.2.4-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-stats-update/pom.xml b/dhp-workflows/dhp-stats-update/pom.xml index 397bd8d08..52f35ff07 100644 --- a/dhp-workflows/dhp-stats-update/pom.xml +++ b/dhp-workflows/dhp-stats-update/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3-SNAPSHOT + 1.2.4-SNAPSHOT 4.0.0 dhp-stats-update diff --git a/dhp-workflows/dhp-worfklow-profiles/pom.xml b/dhp-workflows/dhp-worfklow-profiles/pom.xml index e03362034..34996a021 100644 --- a/dhp-workflows/dhp-worfklow-profiles/pom.xml +++ b/dhp-workflows/dhp-worfklow-profiles/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.3-SNAPSHOT + 1.2.4-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/provision.xml b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_construction.xml similarity index 99% rename from dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/provision.xml rename to dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_construction.xml index 28cbde70d..819b3e12d 100644 --- a/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/provision.xml +++ b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_construction.xml @@ -7,7 +7,7 @@ - Data Provision [OCEAN] + Graph Construction [OCEAN] Data Provision 30 diff --git a/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_to_hiveDB.xml b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_to_hiveDB.xml new file mode 100644 index 000000000..0ace12ea3 --- /dev/null +++ b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_to_hiveDB.xml @@ -0,0 +1,73 @@ + +
+ + + + + +
+ + Graph to HiveDB [OCEAN] + Data Provision + 30 + + + Set the path containing the AGGREGATOR graph + + inputPath + + + + + + + + Set the target path to store the RAW graph + + hiveDbName + + + + + + + + + wait configurations + + + + + + + create the AGGREGATOR graph + + executeOozieJob + IIS + + { + 'inputPath' : 'inputPath', + 'hiveDbName' : 'hiveDbName' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/graph/hive/oozie_app' + } + + build-report + + + + + + + + + wf_20200615_163630_609 + 2020-06-15T17:08:00+00:00 + SUCCESS + + + +
\ No newline at end of file diff --git a/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/update_solr.xml b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/update_solr.xml new file mode 100644 index 000000000..8a7738bcf --- /dev/null +++ b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/update_solr.xml @@ -0,0 +1,98 @@ + +
+ + + + + +
+ + Update Solr [OCEAN] + Data Provision + 30 + + + Set the path containing the AGGREGATOR graph + + inputGraphRootPath + + + + + + + + Set the target path to store the RAW graph + + format + TMF + + + + + + + Set the lookup address + + isLookupUrl + http://beta.services.openaire.eu:8280/is/services/isLookUp?wsdl + + + + + + + + wait configurations + + + + + + + create the AGGREGATOR graph + + executeOozieJob + IIS + + { + 'inputGraphRootPath' : 'inputGraphRootPath', + 'isLookupUrl' : 'isLookupUrl', + 'format' : 'format' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/provision/oozie_app', + 'maxRelations' : '100', + 'relPartitions' : '3000', + 'batchSize' : '2000', + 'relationFilter' : 'isAuthorInstitutionOf,produces,hasAmongTopNSimilarDocuments', + 'otherDsTypeId' : 'scholarcomminfra,infospace,pubsrepository::mock,entityregistry,entityregistry::projects,entityregistry::repositories,websource', + 'resumeFrom' : 'prepare_relations', + 'sparkDriverMemoryForJoining' : '3G', + 'sparkExecutorMemoryForJoining' : '7G', + 'sparkExecutorCoresForJoining' : '4', + 'sparkDriverMemoryForIndexing' : '2G', + 'sparkExecutorMemoryForIndexing' : '2G', + 'sparkExecutorCoresForIndexing' : '64', + 'sparkNetworkTimeout' : '600', + 'workingDir' : '/tmp/beta_provision/working_dir/update_solr' + } + + build-report + + + + + + + + + wf_20200615_163630_609 + 2020-06-15T17:08:00+00:00 + SUCCESS + + + +
\ No newline at end of file diff --git a/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/update_stats.xml b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/update_stats.xml new file mode 100644 index 000000000..a91b6302e --- /dev/null +++ b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/update_stats.xml @@ -0,0 +1,74 @@ + +
+ + + + + +
+ + Update Stats [OCEAN] + Data Provision + 30 + + + Set the path containing the AGGREGATOR graph + + openaire_db_name + + + + + + + + Set the target path to store the RAW graph + + stats_db_name + + + + + + + + + wait configurations + + + + + + + create the AGGREGATOR graph + + executeOozieJob + IIS + + { + 'openaire_db_name' : 'openaire_db_name', + 'stats_db_name' : 'stats_db_name' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/graph/stats/oozie_app', + 'hive_timeout' : '3000' + } + + build-report + + + + + + + + + wf_20200615_163630_609 + 2020-06-15T17:08:00+00:00 + SUCCESS + + + +
\ No newline at end of file diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml index 8d8d57c84..9fbc6d714 100644 --- a/dhp-workflows/pom.xml +++ b/dhp-workflows/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib.dhp dhp - 1.2.3-SNAPSHOT + 1.2.4-SNAPSHOT ../ diff --git a/pom.xml b/pom.xml index 06e2b7aaf..89b7e8829 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ 4.0.0 eu.dnetlib.dhp dhp - 1.2.3-SNAPSHOT + 1.2.4-SNAPSHOT pom