From f3bc8aed31f8e865fbb54d9afc4d5d6130baedfc Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 18 May 2020 15:29:10 +0200 Subject: [PATCH 01/17] lifted memory requirements for country propagation wf --- .../src/main/resources/eu/dnetlib/dhp/wf/profiles/provision.xml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/provision.xml b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/provision.xml index 7c918a0d70..487afee4fa 100644 --- a/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/provision.xml +++ b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/provision.xml @@ -521,6 +521,8 @@ { 'oozie.wf.application.path' : '/lib/dnet/oa/enrichment/country/oozie_app', + 'sparkExecutorCores' : '3', + 'sparkExecutorMemory' : '10G', 'workingDir' : '/tmp/beta_provision/working_dir/country', 'allowedtypes' : 'pubsrepository::institutional', 'whitelist' : '10|opendoar____::300891a62162b960cf02ce3827bb363c', From 0bdfbb0a5794125b4bcbd761a0490d708997bbdd Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 19 May 2020 15:02:21 +0200 Subject: [PATCH 02/17] reintroduced RDD based relation cut off procedure --- .../dhp/oa/provision/PrepareRelationsJob.java | 48 ++++++++++++++----- .../oa/provision/model/SortableRelation.java | 6 +-- .../input_params_prepare_relations.json | 14 +++++- .../dhp/oa/provision/oozie_app/workflow.xml | 4 +- 4 files changed, 56 insertions(+), 16 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java index dbdc54fc04..32a20d62c4 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java @@ -3,7 +3,9 @@ package eu.dnetlib.dhp.oa.provision; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import java.util.HashSet; import java.util.Optional; +import java.util.Set; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; @@ -15,17 +17,21 @@ import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.sources.In; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.base.Splitter; import com.google.common.collect.Iterables; import com.google.common.collect.Iterators; +import com.google.common.collect.Sets; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.oa.provision.model.SortableRelation; import eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner; +import scala.Int; import scala.Tuple2; /** @@ -58,6 +64,8 @@ public class PrepareRelationsJob { public static final int MAX_RELS = 100; + public static final int DEFAULT_NUM_PARTITIONS = 3000; + public static void main(String[] args) throws Exception { String jsonConfiguration = IOUtils .toString( @@ -79,6 +87,24 @@ public class PrepareRelationsJob { String outputPath = parser.get("outputPath"); log.info("outputPath: {}", outputPath); + int relPartitions = Optional + .ofNullable(parser.get("relPartitions")) + .map(Integer::valueOf) + .orElse(DEFAULT_NUM_PARTITIONS); + log.info("relPartitions: {}", relPartitions); + + Set relationFilter = Optional + .ofNullable(parser.get("relationFilter")) + .map(s -> Sets.newHashSet(Splitter.on(",").split(s))) + .orElse(new HashSet<>()); + log.info("relationFilter: {}", relationFilter); + + int maxRelations = Optional + .ofNullable(parser.get("maxRelations")) + .map(Integer::valueOf) + .orElse(MAX_RELS); + log.info("maxRelations: {}", maxRelations); + SparkConf conf = new SparkConf(); runWithSparkSession( @@ -86,12 +112,13 @@ public class PrepareRelationsJob { isSparkSessionManaged, spark -> { removeOutputDir(spark, outputPath); - prepareRelationsFromPaths(spark, inputRelationsPath, outputPath); + prepareRelationsRDDFromPaths( + spark, inputRelationsPath, outputPath, relationFilter, relPartitions, maxRelations); }); } private static void prepareRelationsFromPaths( - SparkSession spark, String inputRelationsPath, String outputPath) { + SparkSession spark, String inputRelationsPath, String outputPath, Set relationFilter) { readPathRelation(spark, inputRelationsPath) .filter("dataInfo.deletedbyinference == false") .groupByKey( @@ -125,20 +152,19 @@ public class PrepareRelationsJob { // TODO work in progress private static void prepareRelationsRDDFromPaths( - SparkSession spark, String inputRelationsPath, String outputPath, int numPartitions) { - JavaRDD rels = readPathRelationRDD(spark, inputRelationsPath).repartition(numPartitions); + SparkSession spark, String inputRelationsPath, String outputPath, Set relationFilter, int relPartitions, + int maxRelations) { + JavaRDD rels = readPathRelationRDD(spark, inputRelationsPath).repartition(relPartitions); + // only consider those that are not virtually deleted RDD d = rels - .filter(rel -> !rel.getDataInfo().getDeletedbyinference()) // only - // consider - // those - // that are not virtually - // deleted + .filter(rel -> !rel.getDataInfo().getDeletedbyinference()) + .filter(rel -> !relationFilter.contains(rel.getRelClass())) .mapToPair( (PairFunction) rel -> new Tuple2<>(rel, rel)) .groupByKey(new RelationPartitioner(rels.getNumPartitions())) - .map(p -> Iterables.limit(p._2(), MAX_RELS)) - .flatMap(p -> p.iterator()) + .map(group -> Iterables.limit(group._2(), maxRelations)) + .flatMap(group -> group.iterator()) .rdd(); spark diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java index 7c866001be..b6571b9bf9 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java @@ -16,10 +16,10 @@ public class SortableRelation extends Relation implements Comparable, static { weights.put("outcome", 0); weights.put("supplement", 1); - weights.put("publicationDataset", 2); + weights.put("affiliation", 2); weights.put("relationship", 3); - weights.put("similarity", 4); - weights.put("affiliation", 5); + weights.put("publicationDataset", 4); + weights.put("similarity", 5); weights.put("provision", 6); weights.put("participation", 7); diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json index bfb248d012..5ce37aa7be 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json @@ -21,6 +21,18 @@ "paramName": "rp", "paramLongName": "relPartitions", "paramDescription": "number or partitions for the relations Dataset", - "paramRequired": true + "paramRequired": false + }, + { + "paramName": "rf", + "paramLongName": "relationFilter", + "paramDescription": "filter applied reading relations (by relClass)", + "paramRequired": false + }, + { + "paramName": "mr", + "paramLongName": "maxRelations", + "paramDescription": "maximum number of relations applied reading relations (by relClass)", + "paramRequired": false } ] diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml index 298ac75892..a84db86880 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml @@ -102,7 +102,9 @@ --inputRelationsPath${inputGraphRootPath}/relation --outputPath${workingDir}/relation - --relPartitions3000 + --relPartitions${relPartitions} + --relationFilter${relationFilter} + --maxRelations${maxRelations} From 85ca5622d45569683191713e0d0e06dfcd3f3fbf Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Tue, 19 May 2020 16:17:35 +0200 Subject: [PATCH 03/17] partial implementation of generation of simple events --- .../eu/dnetlib/dhp/broker/model/Topic.java | 63 ++++++++++------ .../broker/oa/GenerateEventsApplication.java | 74 +++++++++++++++++-- .../EnrichMissingPublicationDate.java | 10 ++- .../dhp/broker/oa/matchers/UpdateMatcher.java | 10 ++- 4 files changed, 120 insertions(+), 37 deletions(-) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Topic.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Topic.java index 29f6cbe3af..98088dd0aa 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Topic.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Topic.java @@ -4,31 +4,48 @@ package eu.dnetlib.dhp.broker.model; public enum Topic { // ENRICHMENT MISSING - ENRICH_MISSING_OA_VERSION("ENRICH/MISSING/OPENACCESS_VERSION"), ENRICH_MISSING_ABSTRACT( - "ENRICH/MISSING/ABSTRACT"), ENRICH_MISSING_PUBLICATION_DATE( - "ENRICH/MISSING/PUBLICATION_DATE"), ENRICH_MISSING_PID( - "ENRICH/MISSING/PID"), ENRICH_MISSING_PROJECT("ENRICH/MISSING/PROJECT"), ENRICH_MISSING_SOFTWARE( - "ENRICH/MISSING/SOFTWARE"), ENRICH_MISSING_SUBJECT_MESHEUROPMC( - "ENRICH/MISSING/SUBJECT/MESHEUROPMC"), ENRICH_MISSING_SUBJECT_ARXIV( - "ENRICH/MISSING/SUBJECT/ARXIV"), ENRICH_MISSING_SUBJECT_JEL( - "ENRICH/MISSING/SUBJECT/JEL"), ENRICH_MISSING_SUBJECT_DDC( - "ENRICH/MISSING/SUBJECT/DDC"), ENRICH_MISSING_SUBJECT_ACM( - "ENRICH/MISSING/SUBJECT/ACM"), ENRICH_MISSING_SUBJECT_RVK( - "ENRICH/MISSING/SUBJECT/RVK"), ENRICH_MISSING_AUTHOR_ORCID( - "ENRICH/MISSING/AUTHOR/ORCID"), + ENRICH_MISSING_OA_VERSION("ENRICH/MISSING/OPENACCESS_VERSION"), + ENRICH_MISSING_ABSTRACT("ENRICH/MISSING/ABSTRACT"), + ENRICH_MISSING_PUBLICATION_DATE("ENRICH/MISSING/PUBLICATION_DATE"), + ENRICH_MISSING_PID("ENRICH/MISSING/PID"), + ENRICH_MISSING_PROJECT("ENRICH/MISSING/PROJECT"), + ENRICH_MISSING_SOFTWARE("ENRICH/MISSING/SOFTWARE"), + ENRICH_MISSING_SUBJECT_MESHEUROPMC("ENRICH/MISSING/SUBJECT/MESHEUROPMC"), + ENRICH_MISSING_SUBJECT_ARXIV("ENRICH/MISSING/SUBJECT/ARXIV"), + ENRICH_MISSING_SUBJECT_JEL("ENRICH/MISSING/SUBJECT/JEL"), + ENRICH_MISSING_SUBJECT_DDC("ENRICH/MISSING/SUBJECT/DDC"), + ENRICH_MISSING_SUBJECT_ACM("ENRICH/MISSING/SUBJECT/ACM"), + ENRICH_MISSING_SUBJECT_RVK("ENRICH/MISSING/SUBJECT/RVK"), + ENRICH_MISSING_AUTHOR_ORCID("ENRICH/MISSING/AUTHOR/ORCID"), // ENRICHMENT MORE - ENRICH_MORE_PID("ENRICH/MORE/PID"), ENRICH_MORE_OA_VERSION("ENRICH/MORE/OPENACCESS_VERSION"), ENRICH_MORE_ABSTRACT( - "ENRICH/MORE/ABSTRACT"), ENRICH_MORE_PUBLICATION_DATE("ENRICH/MORE/PUBLICATION_DATE"), ENRICH_MORE_PROJECT( - "ENRICH/MORE/PROJECT"), ENRICH_MORE_SUBJECT_MESHEUROPMC( - "ENRICH/MORE/SUBJECT/MESHEUROPMC"), ENRICH_MORE_SUBJECT_ARXIV( - "ENRICH/MORE/SUBJECT/ARXIV"), ENRICH_MORE_SUBJECT_JEL( - "ENRICH/MORE/SUBJECT/JEL"), ENRICH_MORE_SUBJECT_DDC( - "ENRICH/MORE/SUBJECT/DDC"), ENRICH_MORE_SUBJECT_ACM( - "ENRICH/MORE/SUBJECT/ACM"), ENRICH_MORE_SUBJECT_RVK("ENRICH/MORE/SUBJECT/RVK"), + ENRICH_MORE_PID("ENRICH/MORE/PID"), + ENRICH_MORE_OA_VERSION("ENRICH/MORE/OPENACCESS_VERSION"), + ENRICH_MORE_ABSTRACT("ENRICH/MORE/ABSTRACT"), + ENRICH_MORE_PUBLICATION_DATE("ENRICH/MORE/PUBLICATION_DATE"), + ENRICH_MORE_PROJECT("ENRICH/MORE/PROJECT"), + ENRICH_MORE_SUBJECT_MESHEUROPMC("ENRICH/MORE/SUBJECT/MESHEUROPMC"), + ENRICH_MORE_SUBJECT_ARXIV("ENRICH/MORE/SUBJECT/ARXIV"), + ENRICH_MORE_SUBJECT_JEL("ENRICH/MORE/SUBJECT/JEL"), + ENRICH_MORE_SUBJECT_DDC("ENRICH/MORE/SUBJECT/DDC"), + ENRICH_MORE_SUBJECT_ACM("ENRICH/MORE/SUBJECT/ACM"), + ENRICH_MORE_SUBJECT_RVK("ENRICH/MORE/SUBJECT/RVK"), // ADDITION - ADD_BY_PROJECT("ADD/BY_PROJECT"); + ADD_BY_PROJECT("ADD/BY_PROJECT"), + + // OTHER RELS + ENRICH_MISSING_PUBLICATION_IS_RELATED_TO("ENRICH/MISSING/PUBLICATION/IS_RELATED_TO"), + ENRICH_MISSING_PUBLICATION_REFERENCES("ENRICH/MISSING/PUBLICATION/REFERENCES"), + ENRICH_MISSING_PUBLICATION_IS_REFERENCED_BY("ENRICH/MISSING/PUBLICATION/IS_REFERENCED_BY"), + ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_TO("ENRICH/MISSING/PUBLICATION/IS_SUPPLEMENTED_TO"), + ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_BY("ENRICH/MISSING/PUBLICATION/IS_SUPPLEMENTED_BY"), + + ENRICH_MISSING_DATASET_IS_RELATED_TO("ENRICH/MISSING/DATASET/IS_RELATED_TO"), + ENRICH_MISSING_DATASET_REFERENCES("ENRICH/MISSING/DATASET/REFERENCES"), + ENRICH_MISSING_DATASET_IS_REFERENCED_BY("ENRICH/MISSING/DATASET/IS_REFERENCED_BY"), + ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_TO("ENRICH/MISSING/DATASET/IS_SUPPLEMENTED_TO"), + ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_BY("ENRICH/MISSING/DATASET/IS_SUPPLEMENTED_BY"); Topic(final String path) { this.path = path; @@ -42,9 +59,7 @@ public enum Topic { public static Topic fromPath(final String path) { for (final Topic t : Topic.values()) { - if (t.getPath().equals(path)) { - return t; - } + if (t.getPath().equals(path)) { return t; } } return null; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java index 43ebd6dd84..5fdd109252 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java @@ -9,11 +9,21 @@ import java.util.Optional; import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; +import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Column; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; + import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.broker.model.Event; import eu.dnetlib.dhp.broker.model.EventFactory; @@ -30,7 +40,11 @@ import eu.dnetlib.dhp.broker.oa.matchers.EnrichMoreSubject; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.Software; public class GenerateEventsApplication { @@ -47,12 +61,13 @@ public class GenerateEventsApplication { private static final UpdateMatcher enrichMorePid = new EnrichMorePid(); private static final UpdateMatcher enrichMoreSubject = new EnrichMoreSubject(); + public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils - .toString( - GenerateEventsApplication.class - .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/merge_claims_parameters.json"))); + .toString(GenerateEventsApplication.class + .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/merge_claims_parameters.json"))); parser.parseArgument(args); final Boolean isSparkSessionManaged = Optional @@ -67,10 +82,23 @@ public class GenerateEventsApplication { final String eventsPath = parser.get("eventsPath"); log.info("eventsPath: {}", eventsPath); + final String resultClassName = parser.get("resultTableName"); + log.info("resultTableName: {}", resultClassName); + final SparkConf conf = new SparkConf(); + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); removeOutputDir(spark, eventsPath); - generateEvents(spark, graphPath, eventsPath); + + final JavaRDD eventsRdd = sc.emptyRDD(); + + eventsRdd.union(generateSimpleEvents(spark, graphPath, Publication.class)); + eventsRdd.union(generateSimpleEvents(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class)); + eventsRdd.union(generateSimpleEvents(spark, graphPath, Software.class)); + eventsRdd.union(generateSimpleEvents(spark, graphPath, OtherResearchProduct.class)); + + eventsRdd.saveAsTextFile(eventsPath, GzipCodec.class); }); } @@ -79,11 +107,34 @@ public class GenerateEventsApplication { HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); } - private static void generateEvents(final SparkSession spark, final String graphPath, final String eventsPath) { - // TODO + private static JavaRDD generateSimpleEvents(final SparkSession spark, + final String graphPath, + final Class resultClazz) { + + final Dataset results = + readPath(spark, graphPath + "/" + resultClazz.getSimpleName().toLowerCase(), resultClazz) + .filter(r -> r.getDataInfo().getDeletedbyinference()); + + final Dataset rels = + readPath(spark, graphPath + "/relation", Relation.class) + .filter(r -> r.getRelClass().equals("TODO")); // TODO mergedIN + + final Column c = null; // TODO + + final Dataset aa = results.joinWith(rels, results.col("id").equalTo(rels.col("source")), "inner") + .groupBy(rels.col("target")) + .agg(c) + .filter(x -> x.size() > 1) + // generateSimpleEvents(...) + // flatMap() + // toRdd() + ; + + return null; + } - private List generateEvents(final Result... children) { + private List generateSimpleEvents(final Result... children) { final List> list = new ArrayList<>(); for (final Result target : children) { @@ -102,4 +153,13 @@ public class GenerateEventsApplication { return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList()); } + public static Dataset readPath( + final SparkSession spark, + final String inputPath, + final Class clazz) { + return spark + .read() + .textFile(inputPath) + .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); + } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationDate.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationDate.java index e9ec082c46..372a4e4c9d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationDate.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationDate.java @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.broker.oa.matchers; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -16,12 +17,15 @@ public class EnrichMissingPublicationDate extends UpdateMatcher { @Override protected List> findUpdates(final Result source, final Result target) { - // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); - return Arrays.asList(); + if (isMissing(target.getDateofacceptance()) && !isMissing(source.getDateofacceptance())) { + return Arrays.asList(generateUpdateInfo(source.getDateofacceptance().getValue(), source, target)); + } + return new ArrayList<>(); } @Override - public UpdateInfo generateUpdateInfo(final String highlightValue, final Result source, + public UpdateInfo generateUpdateInfo(final String highlightValue, + final Result source, final Result target) { return new UpdateInfo<>( Topic.ENRICH_MISSING_PUBLICATION_DATE, diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java index b8b6132cd1..d91b03200e 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java @@ -30,8 +30,7 @@ public abstract class UpdateMatcher { if (source != res) { for (final UpdateInfo info : findUpdates(source, res)) { final String s = DigestUtils.md5Hex(info.getHighlightValueAsString()); - if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) { - } else { + if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) {} else { infoMap.put(s, info); } } @@ -54,11 +53,16 @@ public abstract class UpdateMatcher { protected abstract List> findUpdates(Result source, Result target); - protected abstract UpdateInfo generateUpdateInfo(final T highlightValue, final Result source, + protected abstract UpdateInfo generateUpdateInfo(final T highlightValue, + final Result source, final Result target); protected static boolean isMissing(final List> list) { return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0).getValue()); } + protected boolean isMissing(final Field field) { + return field == null || StringUtils.isBlank(field.getValue()); + } + } From d7d2a0637f262f3e51c519db78744ef8c713de0f Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 20 May 2020 14:55:38 +0200 Subject: [PATCH 04/17] added extra parameters to the provision indexing workflow --- .../input_params_prepare_relations.json | 2 +- .../dhp/oa/provision/oozie_app/workflow.xml | 25 +++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json index 5ce37aa7be..71b2becc4d 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json @@ -32,7 +32,7 @@ { "paramName": "mr", "paramLongName": "maxRelations", - "paramDescription": "maximum number of relations applied reading relations (by relClass)", + "paramDescription": "maximum number of relations allowed for a each entity", "paramRequired": false } ] diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml index a84db86880..9600ccbaa4 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml @@ -9,6 +9,30 @@ isLookupUrl URL for the isLookup service + + relPartitions + number or partitions for the relations Dataset + + + relationFilter + filter applied reading relations (by relClass) + + + maxRelations + maximum number of relations allowed for a each entity + + + otherDsTypeId + mapping used to populate datasourceTypeUi field + + + format + metadata format name (DMF|TMF) + + + batchSize + number of records to be included in each indexing request + sparkDriverMemoryForJoining @@ -421,4 +445,5 @@ + \ No newline at end of file From b3bcbb3129fb26b3222e85cbfc7c421ab6fbeaa3 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Thu, 21 May 2020 08:41:32 +0200 Subject: [PATCH 05/17] resolve name of organization countries --- .../eu/dnetlib/dhp/oa/graph/sql/queryOrganizations.sql | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOrganizations.sql b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOrganizations.sql index aeb04aff91..4dbd1c4298 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOrganizations.sql +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOrganizations.sql @@ -22,12 +22,13 @@ SELECT '' AS inferenceprovenance, d.id AS collectedfromid, d.officialname AS collectedfromname, - o.country || '@@@' || o.country || '@@@dnet:countries@@@dnet:countries' AS country, + o.country || '@@@' || cntr.name || '@@@dnet:countries@@@dnet:countries' AS country, 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction, - ARRAY[]::text[] AS pid + FROM dsm_organizations o LEFT OUTER JOIN dsm_datasources d ON (d.id = o.collectedfrom) + LEFT OUTER JOIN class cntr ON (cntr.code = o.country) From dbfb9c19fe12181c61ef0435937b298a863ed555 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 21 May 2020 10:00:14 +0200 Subject: [PATCH 06/17] minor changes --- .../dhp/oa/provision/PrepareRelationsJob.java | 82 ++++++++++++------- .../dhp/oa/provision/oozie_app/workflow.xml | 9 +- 2 files changed, 59 insertions(+), 32 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java index 32a20d62c4..72d68a389e 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java @@ -17,7 +17,6 @@ import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.sources.In; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -31,7 +30,6 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.oa.provision.model.SortableRelation; import eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner; -import scala.Int; import scala.Tuple2; /** @@ -112,26 +110,74 @@ public class PrepareRelationsJob { isSparkSessionManaged, spark -> { removeOutputDir(spark, outputPath); - prepareRelationsRDDFromPaths( + prepareRelationsRDD( spark, inputRelationsPath, outputPath, relationFilter, relPartitions, maxRelations); }); } - private static void prepareRelationsFromPaths( - SparkSession spark, String inputRelationsPath, String outputPath, Set relationFilter) { + /** + * Dataset based implementation that prepares the graph relations by limiting the number of outgoing links and + * filtering the relation types according to the given criteria. + * + * @param spark the spark session + * @param inputRelationsPath source path for the graph relations + * @param outputPath output path for the processed relations + * @param relationFilter set of relation filters applied to the `relClass` field + * @param maxRelations maximum number of allowed outgoing edges + */ + private static void prepareRelations( + SparkSession spark, String inputRelationsPath, String outputPath, Set relationFilter, + int maxRelations) { readPathRelation(spark, inputRelationsPath) .filter("dataInfo.deletedbyinference == false") + .filter((FilterFunction) rel -> !relationFilter.contains(rel.getRelClass())) .groupByKey( (MapFunction) value -> value.getSource(), Encoders.STRING()) .flatMapGroups( (FlatMapGroupsFunction) (key, values) -> Iterators - .limit(values, MAX_RELS), + .limit(values, maxRelations), Encoders.bean(SortableRelation.class)) .write() .mode(SaveMode.Overwrite) .parquet(outputPath); } + /** + * RDD based implementation that prepares the graph relations by limiting the number of outgoing links and filtering + * the relation types according to the given criteria. Moreover, outgoing links kept within the given limit are + * prioritized according to the weights indicated in eu.dnetlib.dhp.oa.provision.model.SortableRelation. + * + * @param spark the spark session + * @param inputRelationsPath source path for the graph relations + * @param outputPath output path for the processed relations + * @param relationFilter set of relation filters applied to the `relClass` field + * @param maxRelations maximum number of allowed outgoing edges + */ + // TODO work in progress + private static void prepareRelationsRDD( + SparkSession spark, String inputRelationsPath, String outputPath, Set relationFilter, int relPartitions, + int maxRelations) { + JavaRDD rels = readPathRelationRDD(spark, inputRelationsPath).repartition(relPartitions); + RelationPartitioner partitioner = new RelationPartitioner(rels.getNumPartitions()); + + // only consider those that are not virtually deleted + RDD d = rels + .filter(rel -> !rel.getDataInfo().getDeletedbyinference()) + .filter(rel -> !relationFilter.contains(rel.getRelClass())) + .mapToPair( + (PairFunction) rel -> new Tuple2<>(rel, rel)) + .groupByKey(partitioner) + .map(group -> Iterables.limit(group._2(), maxRelations)) + .flatMap(group -> group.iterator()) + .rdd(); + + spark + .createDataset(d, Encoders.bean(SortableRelation.class)) + .write() + .mode(SaveMode.Overwrite) + .parquet(outputPath); + } + /** * Reads a Dataset of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text * file, @@ -150,30 +196,6 @@ public class PrepareRelationsJob { Encoders.bean(SortableRelation.class)); } - // TODO work in progress - private static void prepareRelationsRDDFromPaths( - SparkSession spark, String inputRelationsPath, String outputPath, Set relationFilter, int relPartitions, - int maxRelations) { - JavaRDD rels = readPathRelationRDD(spark, inputRelationsPath).repartition(relPartitions); - - // only consider those that are not virtually deleted - RDD d = rels - .filter(rel -> !rel.getDataInfo().getDeletedbyinference()) - .filter(rel -> !relationFilter.contains(rel.getRelClass())) - .mapToPair( - (PairFunction) rel -> new Tuple2<>(rel, rel)) - .groupByKey(new RelationPartitioner(rels.getNumPartitions())) - .map(group -> Iterables.limit(group._2(), maxRelations)) - .flatMap(group -> group.iterator()) - .rdd(); - - spark - .createDataset(d, Encoders.bean(SortableRelation.class)) - .write() - .mode(SaveMode.Overwrite) - .parquet(outputPath); - } - private static JavaRDD readPathRelationRDD( SparkSession spark, final String inputPath) { JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml index 9600ccbaa4..fd8f5ba89a 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml @@ -80,6 +80,11 @@ spark2EventLogDir spark 2.* event log dir location + + sparkNetworkTimeout + configures spark.network.timeout + + @@ -357,7 +362,7 @@ --inputGraphRootPath${inputGraphRootPath} --inputRelatedEntitiesPath${workingDir}/join_partial --outputPath${workingDir}/join_entities - --numPartitions12000 + --numPartitions24000 @@ -381,7 +386,7 @@ --conf spark.sql.shuffle.partitions=7680 --conf spark.network.timeout=${sparkNetworkTimeout} - --inputPath ${workingDir}/join_entities + --inputPath${workingDir}/join_entities --outputPath${workingDir}/joined From e43d4d7778851846bb0f4f361996682d2c3056f4 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Thu, 21 May 2020 11:08:07 +0200 Subject: [PATCH 07/17] added a coalesce in sql query --- .../eu/dnetlib/dhp/oa/graph/sql/queryOrganizations.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOrganizations.sql b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOrganizations.sql index 4dbd1c4298..d13bd43425 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOrganizations.sql +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOrganizations.sql @@ -22,7 +22,7 @@ SELECT '' AS inferenceprovenance, d.id AS collectedfromid, d.officialname AS collectedfromname, - o.country || '@@@' || cntr.name || '@@@dnet:countries@@@dnet:countries' AS country, + o.country || '@@@' || COALESCE(cntr.name,o.country) || '@@@dnet:countries@@@dnet:countries' AS country, 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction, ARRAY[]::text[] AS pid From 8bbd1d0501ada82c6a037e0379c71006723544ab Mon Sep 17 00:00:00 2001 From: miconis Date: Thu, 21 May 2020 11:52:14 +0200 Subject: [PATCH 08/17] reimplementation of the author merging in deduprecord creation. implementation of the test class. --- .../eu/dnetlib/dhp/oa/dedup/AuthorMerger.java | 159 ++++++++++++++++++ .../dhp/oa/dedup/DedupRecordFactory.java | 23 +-- .../eu/dnetlib/dhp/oa/dedup/DedupUtility.java | 112 ------------ .../dhp/oa/dedup/EntityMergerTest.java | 138 +++++++++++++++ .../dnetlib/dhp/oa/dedup/MergeAuthorTest.java | 54 ------ .../dhp/dedup/json/publication_merge.json | 3 + 6 files changed, 313 insertions(+), 176 deletions(-) create mode 100644 dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AuthorMerger.java create mode 100644 dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java delete mode 100644 dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/MergeAuthorTest.java create mode 100644 dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/publication_merge.json diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AuthorMerger.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AuthorMerger.java new file mode 100644 index 0000000000..108f4a4bec --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AuthorMerger.java @@ -0,0 +1,159 @@ +package eu.dnetlib.dhp.oa.dedup; + +import com.wcohen.ss.JaroWinkler; +import eu.dnetlib.dhp.schema.oaf.Author; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.pace.model.Person; +import org.apache.commons.lang3.StringUtils; +import scala.Tuple2; + +import java.text.Normalizer; +import java.util.*; +import java.util.stream.Collectors; + +public class AuthorMerger { + + private static final Double THRESHOLD = 0.95; + + public static List merge(List> authors){ + + authors.sort(new Comparator>() { + @Override + public int compare(List o1, List o2) { + return -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2)); + } + }); + + List author = new ArrayList<>(); + + for(List a : authors){ + author = mergeAuthor(author, a); + } + + return author; + + } + + public static List mergeAuthor(final List a, final List b) { + int pa = countAuthorsPids(a); + int pb = countAuthorsPids(b); + List base, enrich; + int sa = authorsSize(a); + int sb = authorsSize(b); + + if (pa == pb) { + base = sa > sb ? a : b; + enrich = sa > sb ? b : a; + } else { + base = pa > pb ? a : b; + enrich = pa > pb ? b : a; + } + enrichPidFromList(base, enrich); + return base; + } + + private static void enrichPidFromList(List base, List enrich) { + if (base == null || enrich == null) + return; + final Map basePidAuthorMap = base + .stream() + .filter(a -> a.getPid() != null && a.getPid().size() > 0) + .flatMap( + a -> a + .getPid() + .stream() + .map(p -> new Tuple2<>(pidToComparableString(p), a))) + .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); + + final List> pidToEnrich = enrich + .stream() + .filter(a -> a.getPid() != null && a.getPid().size() > 0) + .flatMap( + a -> a + .getPid() + .stream() + .filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p))) + .map(p -> new Tuple2<>(p, a))) + .collect(Collectors.toList()); + + pidToEnrich + .forEach( + a -> { + Optional> simAuthor = base + .stream() + .map(ba -> new Tuple2<>(sim(ba, a._2()), ba)) + .max(Comparator.comparing(Tuple2::_1)); + if (simAuthor.isPresent() && simAuthor.get()._1() > THRESHOLD) { + Author r = simAuthor.get()._2(); + if (r.getPid() == null) { + r.setPid(new ArrayList<>()); + } + r.getPid().add(a._1()); + } + }); + } + + public static String pidToComparableString(StructuredProperty pid){ + return (pid.getQualifier()!=null? pid.getQualifier().getClassid()!=null?pid.getQualifier().getClassid().toLowerCase():"":"") + (pid.getValue()!=null? pid.getValue().toLowerCase():""); + } + + public static int countAuthorsPids(List authors) { + if (authors == null) + return 0; + + return (int) authors.stream().filter(AuthorMerger::hasPid).count(); + } + + private static int authorsSize(List authors) { + if (authors == null) + return 0; + return authors.size(); + } + + private static Double sim(Author a, Author b) { + + final Person pa = parse(a); + final Person pb = parse(b); + + if (pa.isAccurate() & pb.isAccurate()) { + return new JaroWinkler() + .score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())); + } else { + return new JaroWinkler() + .score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname())); + } + } + + private static boolean hasPid(Author a) { + if (a == null || a.getPid() == null || a.getPid().size() == 0) + return false; + return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue())); + } + + private static Person parse(Author author) { + if (StringUtils.isNotBlank(author.getSurname())) { + return new Person(author.getSurname() + ", " + author.getName(), false); + } else { + return new Person(author.getFullname(), false); + } + } + + private static String normalize(final String s) { + return nfd(s) + .toLowerCase() + // do not compact the regexes in a single expression, would cause StackOverflowError + // in case + // of large input strings + .replaceAll("(\\W)+", " ") + .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") + .replaceAll("(\\p{Punct})+", " ") + .replaceAll("(\\d)+", " ") + .replaceAll("(\\n)+", " ") + .trim(); + } + + private static String nfd(final String s) { + return Normalizer.normalize(s, Normalizer.Form.NFD); + } + +} diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java index fa06424d7a..eed783e536 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java @@ -1,8 +1,9 @@ - package eu.dnetlib.dhp.oa.dedup; +import java.io.Serializable; import java.util.Collection; import java.util.Iterator; +import java.util.List; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapGroupsFunction; @@ -67,16 +68,18 @@ public class DedupRecordFactory { (MapFunction, String>) entity -> entity._1(), Encoders.STRING()) .mapGroups( (MapGroupsFunction, T>) (key, - values) -> entityMerger(key, values, ts, dataInfo), + values) -> entityMerger(key, values, ts, dataInfo, clazz), Encoders.bean(clazz)); } - private static T entityMerger( - String id, Iterator> entities, long ts, DataInfo dataInfo) { + public static T entityMerger( + String id, Iterator> entities, long ts, DataInfo dataInfo, Class clazz) throws IllegalAccessException, InstantiationException { - T entity = entities.next()._2(); + T entity = clazz.newInstance(); final Collection dates = Lists.newArrayList(); + final List> authors = Lists.newArrayList(); + entities .forEachRemaining( t -> { @@ -84,17 +87,17 @@ public class DedupRecordFactory { entity.mergeFrom(duplicate); if (ModelSupport.isSubClass(duplicate, Result.class)) { Result r1 = (Result) duplicate; - Result er = (Result) entity; - er.setAuthor(DedupUtility.mergeAuthor(er.getAuthor(), r1.getAuthor())); - - if (r1.getDateofacceptance() != null) { + if (r1.getAuthor() != null && r1.getAuthor().size()>0) + authors.add(r1.getAuthor()); + if (r1.getDateofacceptance() != null) dates.add(r1.getDateofacceptance().getValue()); - } } }); + //set authors and date if (ModelSupport.isSubClass(entity, Result.class)) { ((Result) entity).setDateofacceptance(DatePicker.pick(dates)); + ((Result) entity).setAuthor(AuthorMerger.merge(authors)); } entity.setId(id); diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java index d3ae8ee4f9..222794d64d 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java @@ -32,7 +32,6 @@ import eu.dnetlib.pace.model.Person; import scala.Tuple2; public class DedupUtility { - private static final Double THRESHOLD = 0.95; public static Map constructAccumulator( final DedupConfig dedupConf, final SparkContext context) { @@ -82,58 +81,6 @@ public class DedupUtility { } } - public static List mergeAuthor(final List a, final List b) { - int pa = countAuthorsPids(a); - int pb = countAuthorsPids(b); - List base, enrich; - int sa = authorsSize(a); - int sb = authorsSize(b); - - if (pa == pb) { - base = sa > sb ? a : b; - enrich = sa > sb ? b : a; - } else { - base = pa > pb ? a : b; - enrich = pa > pb ? b : a; - } - enrichPidFromList(base, enrich); - return base; - } - - private static void enrichPidFromList(List base, List enrich) { - if (base == null || enrich == null) - return; - final Map basePidAuthorMap = base - .stream() - .filter(a -> a.getPid() != null && a.getPid().size() > 0) - .flatMap(a -> a.getPid().stream().map(p -> new Tuple2<>(p.toComparableString(), a))) - .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); - - final List> pidToEnrich = enrich - .stream() - .filter(a -> a.getPid() != null && a.getPid().size() > 0) - .flatMap( - a -> a - .getPid() - .stream() - .filter(p -> !basePidAuthorMap.containsKey(p.toComparableString())) - .map(p -> new Tuple2<>(p, a))) - .collect(Collectors.toList()); - - pidToEnrich - .forEach( - a -> { - Optional> simAuhtor = base - .stream() - .map(ba -> new Tuple2<>(sim(ba, a._2()), ba)) - .max(Comparator.comparing(Tuple2::_1)); - if (simAuhtor.isPresent() && simAuhtor.get()._1() > THRESHOLD) { - Author r = simAuhtor.get()._2(); - r.getPid().add(a._1()); - } - }); - } - public static String createDedupRecordPath( final String basePath, final String actionSetId, final String entityType) { return String.format("%s/%s/%s_deduprecord", basePath, actionSetId, entityType); @@ -153,65 +100,6 @@ public class DedupUtility { return String.format("%s/%s/%s_mergerel", basePath, actionSetId, entityType); } - private static Double sim(Author a, Author b) { - - final Person pa = parse(a); - final Person pb = parse(b); - - if (pa.isAccurate() & pb.isAccurate()) { - return new JaroWinkler() - .score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())); - } else { - return new JaroWinkler() - .score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname())); - } - } - - private static String normalize(final String s) { - return nfd(s) - .toLowerCase() - // do not compact the regexes in a single expression, would cause StackOverflowError - // in case - // of large input strings - .replaceAll("(\\W)+", " ") - .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") - .replaceAll("(\\p{Punct})+", " ") - .replaceAll("(\\d)+", " ") - .replaceAll("(\\n)+", " ") - .trim(); - } - - private static String nfd(final String s) { - return Normalizer.normalize(s, Normalizer.Form.NFD); - } - - private static Person parse(Author author) { - if (StringUtils.isNotBlank(author.getSurname())) { - return new Person(author.getSurname() + ", " + author.getName(), false); - } else { - return new Person(author.getFullname(), false); - } - } - - private static int countAuthorsPids(List authors) { - if (authors == null) - return 0; - - return (int) authors.stream().filter(DedupUtility::hasPid).count(); - } - - private static int authorsSize(List authors) { - if (authors == null) - return 0; - return authors.size(); - } - - private static boolean hasPid(Author a) { - if (a == null || a.getPid() == null || a.getPid().size() == 0) - return false; - return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue())); - } - public static List getConfigurations(String isLookUpUrl, String orchestrator) throws ISLookUpException, DocumentException { final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookUpUrl); diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java new file mode 100644 index 0000000000..526dd73bb0 --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java @@ -0,0 +1,138 @@ +package eu.dnetlib.dhp.oa.dedup; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.io.Serializable; +import java.nio.file.Paths; +import java.util.*; + +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.pace.util.MapDocumentUtil; +import org.codehaus.jackson.map.ObjectMapper; +import org.junit.jupiter.api.BeforeEach; + +import org.junit.jupiter.api.Test; +import scala.Tuple2; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class EntityMergerTest implements Serializable { + + List> publications; + + String testEntityBasePath; + DataInfo dataInfo; + String dedupId = "dedup_id"; + Publication pub_top; + + @BeforeEach + public void setUp() throws Exception { + + testEntityBasePath = Paths + .get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/json").toURI()) + .toFile() + .getAbsolutePath(); + + publications = readSample(testEntityBasePath + "/publication_merge.json", Publication.class); + + pub_top = getTopPub(publications); + + dataInfo = setDI(); + + } + + @Test + public void publicationMergerTest() throws InstantiationException, IllegalAccessException { + + Publication pub_merged = DedupRecordFactory.entityMerger(dedupId, publications.iterator(), 0, dataInfo, Publication.class); + + assertEquals(dedupId, pub_merged.getId()); + + assertEquals(pub_merged.getJournal(), pub_top.getJournal()); + assertEquals(pub_merged.getBestaccessright(), pub_top.getBestaccessright()); + assertEquals(pub_merged.getResulttype(), pub_top.getResulttype()); + assertEquals(pub_merged.getLanguage(), pub_merged.getLanguage()); + assertEquals(pub_merged.getPublisher(), pub_top.getPublisher()); + assertEquals(pub_merged.getEmbargoenddate(), pub_top.getEmbargoenddate()); + assertEquals(pub_merged.getResourcetype().getClassid(), "0004"); + assertEquals(pub_merged.getDateoftransformation(), pub_top.getDateoftransformation()); + assertEquals(pub_merged.getOaiprovenance(), pub_top.getOaiprovenance()); + assertEquals(pub_merged.getDateofcollection(), pub_top.getDateofcollection()); + assertEquals(pub_merged.getInstance().size(),3); + assertEquals(pub_merged.getCountry().size(), 2); + assertEquals(pub_merged.getSubject().size(), 0); + assertEquals(pub_merged.getTitle().size(), 2); + assertEquals(pub_merged.getRelevantdate().size(),0); + assertEquals(pub_merged.getDescription().size(),0); + assertEquals(pub_merged.getSource().size(),0); + assertEquals(pub_merged.getFulltext().size(),0); + assertEquals(pub_merged.getFormat().size(),0); + assertEquals(pub_merged.getContributor().size(),0); + assertEquals(pub_merged.getCoverage().size(),0); + assertEquals(pub_merged.getContext().size(),0); + assertEquals(pub_merged.getExternalReference().size(),0); + assertEquals(pub_merged.getOriginalId().size(),3); + assertEquals(pub_merged.getCollectedfrom().size(),3); + assertEquals(pub_merged.getPid().size(),1); + assertEquals(pub_merged.getExtraInfo().size(),0); + + //verify datainfo + assertEquals(pub_merged.getDataInfo(), dataInfo); + + //verify datepicker + assertEquals(pub_merged.getDateofacceptance().getValue(), "2018-09-30"); + + //verify authors + assertEquals(pub_merged.getAuthor().size(), 9); + assertEquals(AuthorMerger.countAuthorsPids(pub_merged.getAuthor()), 4); + } + + public DataInfo setDI(){ + DataInfo dataInfo = new DataInfo(); + dataInfo.setTrust("0.9"); + dataInfo.setDeletedbyinference(false); + dataInfo.setInferenceprovenance("testing"); + dataInfo.setInferred(true); + return dataInfo; + } + + public Publication getTopPub(List> publications){ + + Double maxTrust = 0.0; + Publication maxPub = new Publication(); + for (Tuple2 publication : publications) { + Double pubTrust = Double.parseDouble(publication._2().getDataInfo().getTrust()); + if(pubTrust > maxTrust){ + maxTrust = pubTrust; + maxPub = publication._2(); + } + } + return maxPub; + } + + public List> readSample(String path, Class clazz) { + List> res = new ArrayList<>(); + BufferedReader reader; + try { + reader = new BufferedReader(new FileReader(path)); + String line = reader.readLine(); + while (line != null) { + res.add( + new Tuple2<>( + MapDocumentUtil.getJPathString("$.id", line), + new ObjectMapper().readValue(line, clazz)) + ); + // read next line + line = reader.readLine(); + } + reader.close(); + } catch (IOException e) { + e.printStackTrace(); + } + + return res; + } + + +} diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/MergeAuthorTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/MergeAuthorTest.java deleted file mode 100644 index a217a2657f..0000000000 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/MergeAuthorTest.java +++ /dev/null @@ -1,54 +0,0 @@ - -package eu.dnetlib.dhp.oa.dedup; - -import java.io.IOException; -import java.util.Arrays; -import java.util.List; -import java.util.stream.Collectors; - -import org.apache.commons.io.IOUtils; -import org.codehaus.jackson.map.ObjectMapper; -import org.junit.jupiter.api.BeforeEach; - -import eu.dnetlib.dhp.schema.oaf.Publication; - -public class MergeAuthorTest { - - private List publicationsToMerge; - private final ObjectMapper mapper = new ObjectMapper(); - - @BeforeEach - public void setUp() throws Exception { - final String json = IOUtils - .toString( - this.getClass().getResourceAsStream("/eu/dnetlib/dhp/dedup/json/authors_merge.json")); - - publicationsToMerge = Arrays - .asList(json.split("\n")) - .stream() - .map( - s -> { - try { - return mapper.readValue(s, Publication.class); - } catch (IOException e) { - throw new RuntimeException(e); - } - }) - .collect(Collectors.toList()); - } - - // FIX ME Michele DB this tests doesn't work - // @Test - public void test() throws Exception { - Publication dedup = new Publication(); - - publicationsToMerge - .forEach( - p -> { - dedup.mergeFrom(p); - dedup.setAuthor(DedupUtility.mergeAuthor(dedup.getAuthor(), p.getAuthor())); - }); - - System.out.println(mapper.writeValueAsString(dedup)); - } -} diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/publication_merge.json b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/publication_merge.json new file mode 100644 index 0000000000..015f9294a3 --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/publication_merge.json @@ -0,0 +1,3 @@ +{"context": [], "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:datasetarchive", "classname": "sysimport:crosswalk:datasetarchive", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "dedup-similarity-result-levenstein", "invisible": false, "trust": "0.95"}, "resourcetype": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "pid": [], "contributor": [], "resulttype": {"classid": "publication", "classname": "publication", "schemename": "dnet:result_typologies", "schemeid": "dnet:result_typologies"}, "relevantdate": [], "collectedfrom": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "Journal.fi", "key": "10|openaire____::6eef8049d0feedc089ee009abca55e35"}], "id": "50|a89337edbe55::4930db9e954866d70916cbfba9f81f97", "subject": [], "instance": [{"refereed": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "hostedby": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "Journal.fi", "key": "10|openaire____::6eef8049d0feedc089ee009abca55e35"}, "processingchargeamount": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "license": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "url": [], "distributionlocation": "", "processingchargecurrency": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "2016-01-01"}, "collectedfrom": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "Journal.fi", "key": "10|openaire____::6eef8049d0feedc089ee009abca55e35"}, "accessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "instancetype": {"classid": "0001", "classname": "Article", "schemename": "dnet:dataCite_resource", "schemeid": "dnet:dataCite_resource"}}], "embargoenddate": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "lastupdatetimestamp": 0, "author": [{"surname": "Go\\u0308tz", "name": "Julia", "pid": [], "rank": 5, "affiliation": [], "fullname": "G\\u00f6tz, Julia"}, {"surname": "Wolff", "name": "Stephan", "pid": [], "rank": 6, "affiliation": [], "fullname": "Wolff, Stephan"}, {"surname": "Jansen", "name": "Olav", "pid": [], "rank": 7, "affiliation": [], "fullname": "Jansen, Olav"}, {"surname": "Dressler", "name": "Dirk", "pid": [{"qualifier": {"classid": "ORCID", "classname": "ORCID"}, "value": "0000-0000-0656-9999"},{"qualifier": {"classid": "id", "classname": "id"}, "value": "987654321"}], "rank": 8, "affiliation": [], "fullname": "Dressler, Dirk"}, {"surname": "Schneider", "name": "Susanne A.", "pid": [], "rank": 9, "affiliation": [], "fullname": "Schneider, Susanne A."}], "source": [], "dateofcollection": "2019-11-05T14:49:22.351Z", "fulltext": [], "dateoftransformation": "2019-11-05T16:10:58.988Z", "description": [], "format": [], "journal": {"issnPrinted": "1459-6067", "conferencedate": "", "conferenceplace": "", "name": "Agricultural and Food Science", "edition": "", "iss": "3", "sp": "", "vol": "27", "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "issnOnline": "1795-1895", "ep": "", "issnLinking": ""}, "coverage": [], "publisher": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "language": {"classid": "eng", "classname": "English", "schemename": "dnet:languages", "schemeid": "dnet:languages"}, "bestaccessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "country": [], "extraInfo": [], "originalId": [], "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "2018-09-30"}, "title": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "qualifier": {"classid": "main title", "classname": "main title", "schemename": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title"}, "value": "Altered brain activation in a reversal learning task unmasks adaptive changes in cognitive control in writer's cramp"}]} +{"context": [], "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "dedup-similarity-result-levenstein", "invisible": false, "trust": "0.9"}, "resourcetype": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "pid": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "qualifier": {"classid": "doi", "classname": "doi", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.1016/j.nicl.2015.11.006"}], "contributor": [], "resulttype": {"classid": "publication", "classname": "publication", "schemename": "dnet:result_typologies", "schemeid": "dnet:result_typologies"}, "relevantdate": [], "collectedfrom": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "BASE (Open Access Aggregator)", "key": "10|openaire____::df45502607927471ecf8a6ae83683ff5"}], "id": "50|base_oa_____::0968af610a356656706657e4f234b340", "subject": [], "instance": [{"refereed": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "hostedby": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "NeuroImage: Clinical", "key": "10|doajarticles::0c0e74daa5d95504eade9c81ebbd5b8a"}, "processingchargeamount": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "license": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "http://creativecommons.org/licenses/by-nc-nd/4.0/"}, "url": ["http://dx.doi.org/10.1016/j.nicl.2015.11.006"], "distributionlocation": "", "processingchargecurrency": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "2016-01-01"}, "collectedfrom": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "BASE (Open Access Aggregator)", "key": "10|openaire____::df45502607927471ecf8a6ae83683ff5"}, "accessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "instancetype": {"classid": "0001", "classname": "Article", "schemename": "dnet:publication_resource", "schemeid": "dnet:publication_resource"}}], "embargoenddate": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "lastupdatetimestamp": 0, "author": [{"surname": "Zeuner", "name": "Kirsten E.", "pid": [], "rank": 1, "affiliation": [], "fullname": "Zeuner, Kirsten E."}, {"surname": "Knutzen", "name": "Arne", "pid": [], "rank": 2, "affiliation": [], "fullname": "Knutzen, Arne"}, {"surname": "Granert", "name": "Oliver", "pid": [{"qualifier": {"classid": "ORCID", "classname": "ORCID"}, "value": "0000-0002-0656-1023"}], "rank": 3, "affiliation": [], "fullname": "Granert, Oliver"}, {"surname": "Sablowsky", "name": "Simone", "pid": [], "rank": 4, "affiliation": [], "fullname": "Sablowsky, Simone"}, {"surname": "Go\\u0308tz", "name": "Julia", "pid": [], "rank": 5, "affiliation": [], "fullname": "G\\u00f6tz, Julia"}, {"surname": "Wolff", "name": "Stephan", "pid": [], "rank": 6, "affiliation": [], "fullname": "Wolff, Stephan"}, {"surname": "Jansen", "name": "Olav", "pid": [], "rank": 7, "affiliation": [], "fullname": "Jansen, Olav"}, {"surname": "Dressler", "name": "Dirk", "pid": [], "rank": 8, "affiliation": [], "fullname": "Dressler, Dirk"}, {"surname": "Schneider", "name": "Susanne A.", "pid": [], "rank": 9, "affiliation": [], "fullname": "Schneider, Susanne A."}, {"surname": "Klein", "name": "Christine", "pid": [], "rank": 10, "affiliation": [], "fullname": "Klein, Christine"}, {"surname": "Deuschl", "name": "Gu\\u0308nther", "pid": [], "rank": 11, "affiliation": [], "fullname": "Deuschl, G\\u00fcnther"}, {"surname": "Eimeren", "name": "Thilo", "pid": [], "rank": 12, "affiliation": [], "fullname": "van Eimeren, Thilo"}, {"surname": "Witt", "name": "Karsten", "pid": [], "rank": 13, "affiliation": [], "fullname": "Witt, Karsten"}], "source": [], "dateofcollection": "2017-07-27T19:04:09.131Z", "fulltext": [], "dateoftransformation": "2019-01-23T10:15:19.582Z", "description": [], "format": [], "journal": {"issnPrinted": "2213-1582", "conferencedate": "", "conferenceplace": "", "name": "NeuroImage: Clinical", "edition": "", "iss": "", "sp": "63", "vol": "10", "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "issnOnline": "", "ep": "70", "issnLinking": ""}, "coverage": [], "publisher": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "Elsevier BV"}, "language": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "bestaccessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "country": [{"classid": "IT", "classname": "Italy", "schemeid": "dnet:countries", "schemename": "dnet:countries"}], "extraInfo": [], "originalId": ["10.1016/j.nicl.2015.11.006"], "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "2016-01-01"}, "title": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "qualifier": {"classid": "main title", "classname": "main title", "schemename": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title"}, "value": "Altered brain activation in a reversal learning task unmasks adaptive changes in cognitive control in writer's cramp"}]} +{"context": [], "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:datasetarchive", "classname": "sysimport:crosswalk:datasetarchive", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "dedup-similarity-result-levenstein", "invisible": false, "trust": "0.9"}, "resourcetype": {"classid": "0004", "classname": "Conference object", "schemename": "dnet:dataCite_resource", "schemeid": "dnet:dataCite_resource"}, "pid": [], "contributor": [], "resulttype": {"classid": "publication", "classname": "publication", "schemename": "dnet:result_typologies", "schemeid": "dnet:result_typologies"}, "relevantdate": [], "collectedfrom": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "CRIS UNS (Current Research Information System University of Novi Sad)", "key": "10|CRIS_UNS____::f66f1bd369679b5b077dcdf006089556"}], "id": "50|CrisUnsNoviS::9f9d014eea45dab432cab636c4c9cf39", "subject": [], "instance": [{"refereed": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "hostedby": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "CRIS UNS (Current Research Information System University of Novi Sad)", "key": "10|CRIS_UNS____::f66f1bd369679b5b077dcdf006089556"}, "processingchargeamount": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "license": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "url": ["https://www.cris.uns.ac.rs/record.jsf?recordId=113444&source=OpenAIRE&language=en"], "distributionlocation": "", "processingchargecurrency": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "2019-01-01"}, "collectedfrom": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "CRIS UNS (Current Research Information System University of Novi Sad)", "key": "10|CRIS_UNS____::f66f1bd369679b5b077dcdf006089556"}, "accessright": {"classid": "UNKNOWN", "classname": "UNKNOWN", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "instancetype": {"classid": "0004", "classname": "Conference object", "schemename": "dnet:dataCite_resource", "schemeid": "dnet:dataCite_resource"}}], "embargoenddate": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "lastupdatetimestamp": 0, "author": [{"surname": "Zeuner", "name": "Kirsten E.", "pid": [], "rank": 1, "affiliation": [], "fullname": "Zeuner, Kirsten E."}, {"surname": "Knutzen", "name": "Arne", "pid": [], "rank": 2, "affiliation": [], "fullname": "Knutzen, Arne"}, {"surname": "Granert", "name": "Oliver", "pid": [{"qualifier": {"classid": "ORCID", "classname": "ORCID"}, "value": "0000-0002-0656-1023"}, {"qualifier": {"classid": "pubmed", "classname": "pubmed"}, "value": "pubmed.it"}], "rank": 3, "affiliation": [], "fullname": "Granert, Oliver"}, {"surname": "Sablowsky", "name": "Simone", "pid": [{"qualifier": {"classid": "id", "classname": "id"}, "value": "12345678"}], "rank": 4, "affiliation": [], "fullname": "Sablowsky, Simone"}, {"surname": "Go\\u0308tz", "name": "Julia", "pid": [], "rank": 5, "affiliation": [], "fullname": "G\\u00f6tz, Julia"}, {"surname": "Wolff", "name": "Stephan", "pid": [], "rank": 6, "affiliation": [], "fullname": "Wolff, Stephan"}, {"surname": "Jansen", "name": "Olav", "pid": [{"qualifier": {"classid": "ORCID", "classname": "ORCID"}, "value": "0000-0000-0656-1023"},{"qualifier": {"classid": "id", "classname": "id"}, "value": "987654321"}], "rank": 7, "affiliation": [], "fullname": "Jansen, Olav"}, {"surname": "Dressler", "name": "Dirk", "pid": [], "rank": 8, "affiliation": [], "fullname": "Dressler, Dirk"}, {"surname": "Schneider", "name": "Susanne A.", "pid": [], "rank": 9, "affiliation": [], "fullname": "Schneider, Susanne A."}], "source": [], "dateofcollection": "2020-03-10T15:05:38.685Z", "fulltext": [], "dateoftransformation": "2020-03-11T20:11:13.15Z", "description": [], "format": [], "journal": {"issnPrinted": "", "conferencedate": "", "conferenceplace": "", "name": "", "edition": "", "iss": "", "sp": "", "vol": "", "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "issnOnline": "", "ep": "", "issnLinking": ""}, "coverage": [], "publisher": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "language": {"classid": "en", "classname": "en", "schemename": "dnet:languages", "schemeid": "dnet:languages"}, "bestaccessright": {"classid": "UNKNOWN", "classname": "not available", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "country": [{"classid": "FI", "classname": "Finland", "schemeid": "dnet:countries", "schemename": "dnet:countries"}], "extraInfo": [], "originalId": ["(BISIS)113444", "https://www.cris.uns.ac.rs/record.jsf?recordId=113444&source=OpenAIRE&language=en"], "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "2016-01-01"}, "title": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "qualifier": {"classid": "main title", "classname": "main title", "schemename": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title"}, "value": "Antichains of copies of ultrahomogeneous structures"}]} \ No newline at end of file From 3e345174790135695590409284747c42d6c9cbf3 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Thu, 21 May 2020 16:47:53 +0200 Subject: [PATCH 09/17] partial implementation of events with rels --- .../eu/dnetlib/dhp/broker/model/Topic.java | 69 ++++---- .../broker/oa/GenerateEventsApplication.java | 161 +++++++++++++++--- .../oa/matchers/EnrichMissingAbstract.java | 5 +- .../oa/matchers/EnrichMissingAuthorOrcid.java | 5 +- .../EnrichMissingDatasetIsReferencedBy.java | 38 +++++ .../EnrichMissingDatasetIsRelatedTo.java | 38 +++++ .../EnrichMissingDatasetIsSupplementedBy.java | 38 +++++ .../EnrichMissingDatasetIsSupplementedTo.java | 38 +++++ .../EnrichMissingDatasetReferences.java | 38 +++++ .../oa/matchers/EnrichMissingOpenAccess.java | 2 +- .../broker/oa/matchers/EnrichMissingPid.java | 2 +- .../oa/matchers/EnrichMissingProject.java | 21 ++- .../EnrichMissingPublicationDate.java | 2 +- ...nrichMissingPublicationIsReferencedBy.java | 42 +++++ .../EnrichMissingPublicationIsRelatedTo.java | 42 +++++ ...ichMissingPublicationIsSupplementedBy.java | 42 +++++ ...ichMissingPublicationIsSupplementedTo.java | 42 +++++ .../EnrichMissingPublicationReferences.java | 42 +++++ .../oa/matchers/EnrichMissingSoftware.java | 41 +++++ .../oa/matchers/EnrichMissingSubject.java | 2 +- .../oa/matchers/EnrichMoreOpenAccess.java | 2 +- .../dhp/broker/oa/matchers/EnrichMorePid.java | 2 +- .../broker/oa/matchers/EnrichMoreProject.java | 39 +++++ .../oa/matchers/EnrichMoreSoftware.java | 41 +++++ .../broker/oa/matchers/EnrichMoreSubject.java | 2 +- .../dhp/broker/oa/matchers/UpdateMatcher.java | 16 +- .../dhp/broker/oa/util/BrokerConstants.java | 2 + 27 files changed, 729 insertions(+), 85 deletions(-) create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetIsReferencedBy.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetIsRelatedTo.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetIsSupplementedBy.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetIsSupplementedTo.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetReferences.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationIsReferencedBy.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationIsRelatedTo.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationIsSupplementedBy.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationIsSupplementedTo.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationReferences.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingSoftware.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreProject.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreSoftware.java diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Topic.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Topic.java index 98088dd0aa..0716bd98de 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Topic.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Topic.java @@ -4,48 +4,45 @@ package eu.dnetlib.dhp.broker.model; public enum Topic { // ENRICHMENT MISSING - ENRICH_MISSING_OA_VERSION("ENRICH/MISSING/OPENACCESS_VERSION"), - ENRICH_MISSING_ABSTRACT("ENRICH/MISSING/ABSTRACT"), - ENRICH_MISSING_PUBLICATION_DATE("ENRICH/MISSING/PUBLICATION_DATE"), - ENRICH_MISSING_PID("ENRICH/MISSING/PID"), - ENRICH_MISSING_PROJECT("ENRICH/MISSING/PROJECT"), - ENRICH_MISSING_SOFTWARE("ENRICH/MISSING/SOFTWARE"), - ENRICH_MISSING_SUBJECT_MESHEUROPMC("ENRICH/MISSING/SUBJECT/MESHEUROPMC"), - ENRICH_MISSING_SUBJECT_ARXIV("ENRICH/MISSING/SUBJECT/ARXIV"), - ENRICH_MISSING_SUBJECT_JEL("ENRICH/MISSING/SUBJECT/JEL"), - ENRICH_MISSING_SUBJECT_DDC("ENRICH/MISSING/SUBJECT/DDC"), - ENRICH_MISSING_SUBJECT_ACM("ENRICH/MISSING/SUBJECT/ACM"), - ENRICH_MISSING_SUBJECT_RVK("ENRICH/MISSING/SUBJECT/RVK"), - ENRICH_MISSING_AUTHOR_ORCID("ENRICH/MISSING/AUTHOR/ORCID"), + ENRICH_MISSING_OA_VERSION("ENRICH/MISSING/OPENACCESS_VERSION"), ENRICH_MISSING_ABSTRACT( + "ENRICH/MISSING/ABSTRACT"), ENRICH_MISSING_PUBLICATION_DATE( + "ENRICH/MISSING/PUBLICATION_DATE"), ENRICH_MISSING_PID( + "ENRICH/MISSING/PID"), ENRICH_MISSING_PROJECT("ENRICH/MISSING/PROJECT"), ENRICH_MISSING_SOFTWARE( + "ENRICH/MISSING/SOFTWARE"), ENRICH_MISSING_SUBJECT_MESHEUROPMC( + "ENRICH/MISSING/SUBJECT/MESHEUROPMC"), ENRICH_MISSING_SUBJECT_ARXIV( + "ENRICH/MISSING/SUBJECT/ARXIV"), ENRICH_MISSING_SUBJECT_JEL( + "ENRICH/MISSING/SUBJECT/JEL"), ENRICH_MISSING_SUBJECT_DDC( + "ENRICH/MISSING/SUBJECT/DDC"), ENRICH_MISSING_SUBJECT_ACM( + "ENRICH/MISSING/SUBJECT/ACM"), ENRICH_MISSING_SUBJECT_RVK( + "ENRICH/MISSING/SUBJECT/RVK"), ENRICH_MISSING_AUTHOR_ORCID( + "ENRICH/MISSING/AUTHOR/ORCID"), // ENRICHMENT MORE - ENRICH_MORE_PID("ENRICH/MORE/PID"), - ENRICH_MORE_OA_VERSION("ENRICH/MORE/OPENACCESS_VERSION"), - ENRICH_MORE_ABSTRACT("ENRICH/MORE/ABSTRACT"), - ENRICH_MORE_PUBLICATION_DATE("ENRICH/MORE/PUBLICATION_DATE"), - ENRICH_MORE_PROJECT("ENRICH/MORE/PROJECT"), - ENRICH_MORE_SUBJECT_MESHEUROPMC("ENRICH/MORE/SUBJECT/MESHEUROPMC"), - ENRICH_MORE_SUBJECT_ARXIV("ENRICH/MORE/SUBJECT/ARXIV"), - ENRICH_MORE_SUBJECT_JEL("ENRICH/MORE/SUBJECT/JEL"), - ENRICH_MORE_SUBJECT_DDC("ENRICH/MORE/SUBJECT/DDC"), - ENRICH_MORE_SUBJECT_ACM("ENRICH/MORE/SUBJECT/ACM"), - ENRICH_MORE_SUBJECT_RVK("ENRICH/MORE/SUBJECT/RVK"), + ENRICH_MORE_PID("ENRICH/MORE/PID"), ENRICH_MORE_OA_VERSION("ENRICH/MORE/OPENACCESS_VERSION"), ENRICH_MORE_ABSTRACT( + "ENRICH/MORE/ABSTRACT"), ENRICH_MORE_PUBLICATION_DATE("ENRICH/MORE/PUBLICATION_DATE"), ENRICH_MORE_PROJECT( + "ENRICH/MORE/PROJECT"), ENRICH_MORE_SOFTWARE("ENRICH/MORE/SOFTWARE"), ENRICH_MORE_SUBJECT_MESHEUROPMC( + "ENRICH/MORE/SUBJECT/MESHEUROPMC"), ENRICH_MORE_SUBJECT_ARXIV( + "ENRICH/MORE/SUBJECT/ARXIV"), ENRICH_MORE_SUBJECT_JEL( + "ENRICH/MORE/SUBJECT/JEL"), ENRICH_MORE_SUBJECT_DDC( + "ENRICH/MORE/SUBJECT/DDC"), ENRICH_MORE_SUBJECT_ACM( + "ENRICH/MORE/SUBJECT/ACM"), ENRICH_MORE_SUBJECT_RVK("ENRICH/MORE/SUBJECT/RVK"), // ADDITION ADD_BY_PROJECT("ADD/BY_PROJECT"), // OTHER RELS - ENRICH_MISSING_PUBLICATION_IS_RELATED_TO("ENRICH/MISSING/PUBLICATION/IS_RELATED_TO"), - ENRICH_MISSING_PUBLICATION_REFERENCES("ENRICH/MISSING/PUBLICATION/REFERENCES"), - ENRICH_MISSING_PUBLICATION_IS_REFERENCED_BY("ENRICH/MISSING/PUBLICATION/IS_REFERENCED_BY"), - ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_TO("ENRICH/MISSING/PUBLICATION/IS_SUPPLEMENTED_TO"), - ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_BY("ENRICH/MISSING/PUBLICATION/IS_SUPPLEMENTED_BY"), + ENRICH_MISSING_PUBLICATION_IS_RELATED_TO( + "ENRICH/MISSING/PUBLICATION/IS_RELATED_TO"), ENRICH_MISSING_PUBLICATION_REFERENCES( + "ENRICH/MISSING/PUBLICATION/REFERENCES"), ENRICH_MISSING_PUBLICATION_IS_REFERENCED_BY( + "ENRICH/MISSING/PUBLICATION/IS_REFERENCED_BY"), ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_TO( + "ENRICH/MISSING/PUBLICATION/IS_SUPPLEMENTED_TO"), ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_BY( + "ENRICH/MISSING/PUBLICATION/IS_SUPPLEMENTED_BY"), - ENRICH_MISSING_DATASET_IS_RELATED_TO("ENRICH/MISSING/DATASET/IS_RELATED_TO"), - ENRICH_MISSING_DATASET_REFERENCES("ENRICH/MISSING/DATASET/REFERENCES"), - ENRICH_MISSING_DATASET_IS_REFERENCED_BY("ENRICH/MISSING/DATASET/IS_REFERENCED_BY"), - ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_TO("ENRICH/MISSING/DATASET/IS_SUPPLEMENTED_TO"), - ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_BY("ENRICH/MISSING/DATASET/IS_SUPPLEMENTED_BY"); + ENRICH_MISSING_DATASET_IS_RELATED_TO("ENRICH/MISSING/DATASET/IS_RELATED_TO"), ENRICH_MISSING_DATASET_REFERENCES( + "ENRICH/MISSING/DATASET/REFERENCES"), ENRICH_MISSING_DATASET_IS_REFERENCED_BY( + "ENRICH/MISSING/DATASET/IS_REFERENCED_BY"), ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_TO( + "ENRICH/MISSING/DATASET/IS_SUPPLEMENTED_TO"), ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_BY( + "ENRICH/MISSING/DATASET/IS_SUPPLEMENTED_BY"),; Topic(final String path) { this.path = path; @@ -59,7 +56,9 @@ public enum Topic { public static Topic fromPath(final String path) { for (final Topic t : Topic.values()) { - if (t.getPath().equals(path)) { return t; } + if (t.getPath().equals(path)) { + return t; + } } return null; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java index 5fdd109252..fa425a1819 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java @@ -4,11 +4,14 @@ package eu.dnetlib.dhp.broker.oa; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.util.ArrayList; +import java.util.Collection; import java.util.List; +import java.util.Map; import java.util.Optional; import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.tuple.Pair; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -29,18 +32,33 @@ import eu.dnetlib.dhp.broker.model.Event; import eu.dnetlib.dhp.broker.model.EventFactory; import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingAbstract; import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingAuthorOrcid; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingDatasetIsReferencedBy; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingDatasetIsRelatedTo; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingDatasetIsSupplementedBy; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingDatasetIsSupplementedTo; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingDatasetReferences; import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingOpenAccess; import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPid; import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingProject; import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPublicationDate; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPublicationIsReferencedBy; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPublicationIsRelatedTo; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPublicationIsSupplementedBy; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPublicationIsSupplementedTo; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPublicationReferences; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingSoftware; import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingSubject; import eu.dnetlib.dhp.broker.oa.matchers.EnrichMoreOpenAccess; import eu.dnetlib.dhp.broker.oa.matchers.EnrichMorePid; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMoreProject; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMoreSoftware; import eu.dnetlib.dhp.broker.oa.matchers.EnrichMoreSubject; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; +import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Result; @@ -50,24 +68,44 @@ public class GenerateEventsApplication { private static final Logger log = LoggerFactory.getLogger(GenerateEventsApplication.class); - private static final UpdateMatcher enrichMissingAbstract = new EnrichMissingAbstract(); - private static final UpdateMatcher enrichMissingAuthorOrcid = new EnrichMissingAuthorOrcid(); - private static final UpdateMatcher enrichMissingOpenAccess = new EnrichMissingOpenAccess(); - private static final UpdateMatcher enrichMissingPid = new EnrichMissingPid(); - private static final UpdateMatcher enrichMissingProject = new EnrichMissingProject(); - private static final UpdateMatcher enrichMissingPublicationDate = new EnrichMissingPublicationDate(); - private static final UpdateMatcher enrichMissingSubject = new EnrichMissingSubject(); - private static final UpdateMatcher enrichMoreOpenAccess = new EnrichMoreOpenAccess(); - private static final UpdateMatcher enrichMorePid = new EnrichMorePid(); - private static final UpdateMatcher enrichMoreSubject = new EnrichMoreSubject(); + // Simple Matchers + private static final UpdateMatcher enrichMissingAbstract = new EnrichMissingAbstract(); + private static final UpdateMatcher enrichMissingAuthorOrcid = new EnrichMissingAuthorOrcid(); + private static final UpdateMatcher enrichMissingOpenAccess = new EnrichMissingOpenAccess(); + private static final UpdateMatcher enrichMissingPid = new EnrichMissingPid(); + private static final UpdateMatcher enrichMissingPublicationDate = new EnrichMissingPublicationDate(); + private static final UpdateMatcher enrichMissingSubject = new EnrichMissingSubject(); + private static final UpdateMatcher enrichMoreOpenAccess = new EnrichMoreOpenAccess(); + private static final UpdateMatcher enrichMorePid = new EnrichMorePid(); + private static final UpdateMatcher enrichMoreSubject = new EnrichMoreSubject(); + + // Advanced matchers + private static final UpdateMatcher>, ?> enrichMissingProject = new EnrichMissingProject(); + private static final UpdateMatcher>, ?> enrichMoreProject = new EnrichMoreProject(); + + private static final UpdateMatcher>, ?> enrichMissingSoftware = new EnrichMissingSoftware(); + private static final UpdateMatcher>, ?> enrichMoreSoftware = new EnrichMoreSoftware(); + + private static final UpdateMatcher>, ?> enrichMisissingPublicationIsRelatedTo = new EnrichMissingPublicationIsRelatedTo(); + private static final UpdateMatcher>, ?> enrichMissingPublicationIsReferencedBy = new EnrichMissingPublicationIsReferencedBy(); + private static final UpdateMatcher>, ?> enrichMissingPublicationReferences = new EnrichMissingPublicationReferences(); + private static final UpdateMatcher>, ?> enrichMissingPublicationIsSupplementedTo = new EnrichMissingPublicationIsSupplementedTo(); + private static final UpdateMatcher>, ?> enrichMissingPublicationIsSupplementedBy = new EnrichMissingPublicationIsSupplementedBy(); + + private static final UpdateMatcher>, ?> enrichMisissingDatasetIsRelatedTo = new EnrichMissingDatasetIsRelatedTo(); + private static final UpdateMatcher>, ?> enrichMissingDatasetIsReferencedBy = new EnrichMissingDatasetIsReferencedBy(); + private static final UpdateMatcher>, ?> enrichMissingDatasetReferences = new EnrichMissingDatasetReferences(); + private static final UpdateMatcher>, ?> enrichMissingDatasetIsSupplementedTo = new EnrichMissingDatasetIsSupplementedTo(); + private static final UpdateMatcher>, ?> enrichMissingDatasetIsSupplementedBy = new EnrichMissingDatasetIsSupplementedBy(); public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils - .toString(GenerateEventsApplication.class - .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/merge_claims_parameters.json"))); + .toString( + GenerateEventsApplication.class + .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/merge_claims_parameters.json"))); parser.parseArgument(args); final Boolean isSparkSessionManaged = Optional @@ -82,9 +120,6 @@ public class GenerateEventsApplication { final String eventsPath = parser.get("eventsPath"); log.info("eventsPath: {}", eventsPath); - final String resultClassName = parser.get("resultTableName"); - log.info("resultTableName: {}", resultClassName); - final SparkConf conf = new SparkConf(); runWithSparkSession(conf, isSparkSessionManaged, spark -> { @@ -111,17 +146,17 @@ public class GenerateEventsApplication { final String graphPath, final Class resultClazz) { - final Dataset results = - readPath(spark, graphPath + "/" + resultClazz.getSimpleName().toLowerCase(), resultClazz) + final Dataset results = readPath( + spark, graphPath + "/" + resultClazz.getSimpleName().toLowerCase(), resultClazz) .filter(r -> r.getDataInfo().getDeletedbyinference()); - final Dataset rels = - readPath(spark, graphPath + "/relation", Relation.class) - .filter(r -> r.getRelClass().equals("TODO")); // TODO mergedIN + final Dataset rels = readPath(spark, graphPath + "/relation", Relation.class) + .filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)); final Column c = null; // TODO - final Dataset aa = results.joinWith(rels, results.col("id").equalTo(rels.col("source")), "inner") + final Dataset aa = results + .joinWith(rels, results.col("id").equalTo(rels.col("source")), "inner") .groupBy(rels.col("target")) .agg(c) .filter(x -> x.size() > 1) @@ -134,7 +169,7 @@ public class GenerateEventsApplication { } - private List generateSimpleEvents(final Result... children) { + private List generateSimpleEvents(final Collection children) { final List> list = new ArrayList<>(); for (final Result target : children) { @@ -142,7 +177,6 @@ public class GenerateEventsApplication { list.addAll(enrichMissingAuthorOrcid.searchUpdatesForRecord(target, children)); list.addAll(enrichMissingOpenAccess.searchUpdatesForRecord(target, children)); list.addAll(enrichMissingPid.searchUpdatesForRecord(target, children)); - list.addAll(enrichMissingProject.searchUpdatesForRecord(target, children)); list.addAll(enrichMissingPublicationDate.searchUpdatesForRecord(target, children)); list.addAll(enrichMissingSubject.searchUpdatesForRecord(target, children)); list.addAll(enrichMoreOpenAccess.searchUpdatesForRecord(target, children)); @@ -153,6 +187,87 @@ public class GenerateEventsApplication { return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList()); } + private List generateProjectsEvents(final Collection>> childrenWithProjects) { + final List> list = new ArrayList<>(); + + for (final Pair> target : childrenWithProjects) { + list.addAll(enrichMissingProject.searchUpdatesForRecord(target, childrenWithProjects)); + list.addAll(enrichMoreProject.searchUpdatesForRecord(target, childrenWithProjects)); + } + + return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList()); + } + + private List generateSoftwareEvents(final Collection>> childrenWithSoftwares) { + final List> list = new ArrayList<>(); + + for (final Pair> target : childrenWithSoftwares) { + list.addAll(enrichMissingSoftware.searchUpdatesForRecord(target, childrenWithSoftwares)); + list.addAll(enrichMoreSoftware.searchUpdatesForRecord(target, childrenWithSoftwares)); + } + return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList()); + } + + private List generatePublicationRelatedEvents(final String relType, + final Collection>>> childrenWithRels) { + + final List> list = new ArrayList<>(); + + final List>> cleanedChildrens = childrenWithRels + .stream() + .filter(p -> p.getRight().containsKey(relType)) + .map(p -> Pair.of(p.getLeft(), p.getRight().get(relType))) + .filter(p -> p.getRight().size() > 0) + .collect(Collectors.toList()); + + for (final Pair> target : cleanedChildrens) { + if (relType.equals("isRelatedTo")) { + list.addAll(enrichMisissingPublicationIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens)); + } else if (relType.equals("references")) { + list.addAll(enrichMissingPublicationReferences.searchUpdatesForRecord(target, cleanedChildrens)); + } else if (relType.equals("isReferencedBy")) { + list.addAll(enrichMissingPublicationIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens)); + } else if (relType.equals("isSupplementedTo")) { + list.addAll(enrichMissingPublicationIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens)); + } else if (relType.equals("isSupplementedBy")) { + list.addAll(enrichMissingPublicationIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens)); + } + } + + return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList()); + + } + + private List generateDatasetRelatedEvents(final String relType, + final Collection>>> childrenWithRels) { + + final List> list = new ArrayList<>(); + + final List>> cleanedChildrens = childrenWithRels + .stream() + .filter(p -> p.getRight().containsKey(relType)) + .map(p -> Pair.of(p.getLeft(), p.getRight().get(relType))) + .filter(p -> p.getRight().size() > 0) + .collect(Collectors.toList()); + + for (final Pair> target : cleanedChildrens) { + if (relType.equals("isRelatedTo")) { + list.addAll(enrichMisissingDatasetIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens)); + } else if (relType.equals("references")) { + list.addAll(enrichMissingDatasetReferences.searchUpdatesForRecord(target, cleanedChildrens)); + } else if (relType.equals("isReferencedBy")) { + list.addAll(enrichMissingDatasetIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens)); + } else if (relType.equals("isSupplementedTo")) { + list.addAll(enrichMissingDatasetIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens)); + } else if (relType.equals("isSupplementedBy")) { + list.addAll(enrichMissingDatasetIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens)); + } + } + + return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList()); + + } + public static Dataset readPath( final SparkSession spark, final String inputPath, diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAbstract.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAbstract.java index 43cf738f8c..6dab6355fd 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAbstract.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAbstract.java @@ -9,7 +9,7 @@ import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMissingAbstract extends UpdateMatcher { +public class EnrichMissingAbstract extends UpdateMatcher { public EnrichMissingAbstract() { super(false); @@ -24,7 +24,8 @@ public class EnrichMissingAbstract extends UpdateMatcher { } @Override - public UpdateInfo generateUpdateInfo(final String highlightValue, final Result source, + public UpdateInfo generateUpdateInfo(final String highlightValue, + final Result source, final Result target) { return new UpdateInfo<>( Topic.ENRICH_MISSING_ABSTRACT, diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAuthorOrcid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAuthorOrcid.java index beeccdbe84..c7146ad79a 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAuthorOrcid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAuthorOrcid.java @@ -10,7 +10,7 @@ import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMissingAuthorOrcid extends UpdateMatcher> { +public class EnrichMissingAuthorOrcid extends UpdateMatcher> { public EnrichMissingAuthorOrcid() { super(true); @@ -24,7 +24,8 @@ public class EnrichMissingAuthorOrcid extends UpdateMatcher @Override public UpdateInfo> generateUpdateInfo(final Pair highlightValue, - final Result source, final Result target) { + final Result source, + final Result target) { return new UpdateInfo<>( Topic.ENRICH_MISSING_AUTHOR_ORCID, highlightValue, source, target, diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetIsReferencedBy.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetIsReferencedBy.java new file mode 100644 index 0000000000..3b9326fef2 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetIsReferencedBy.java @@ -0,0 +1,38 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMissingDatasetIsReferencedBy + extends UpdateMatcher>, eu.dnetlib.broker.objects.Dataset> { + + public EnrichMissingDatasetIsReferencedBy() { + super(true); + } + + @Override + protected List> findUpdates(final Pair> source, + final Pair> target) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected UpdateInfo generateUpdateInfo( + final eu.dnetlib.broker.objects.Dataset highlightValue, + final Pair> source, + final Pair> target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_DATASET_IS_REFERENCED_BY, + highlightValue, source.getLeft(), target.getLeft(), + (p, rel) -> p.getDatasets().add(rel), + rel -> rel.getInstances().get(0).getUrl()); + } +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetIsRelatedTo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetIsRelatedTo.java new file mode 100644 index 0000000000..35f7c52b42 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetIsRelatedTo.java @@ -0,0 +1,38 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMissingDatasetIsRelatedTo + extends UpdateMatcher>, eu.dnetlib.broker.objects.Dataset> { + + public EnrichMissingDatasetIsRelatedTo() { + super(true); + } + + @Override + protected List> findUpdates(final Pair> source, + final Pair> target) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected UpdateInfo generateUpdateInfo( + final eu.dnetlib.broker.objects.Dataset highlightValue, + final Pair> source, + final Pair> target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_DATASET_IS_RELATED_TO, + highlightValue, source.getLeft(), target.getLeft(), + (p, rel) -> p.getDatasets().add(rel), + rel -> rel.getInstances().get(0).getUrl()); + } +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetIsSupplementedBy.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetIsSupplementedBy.java new file mode 100644 index 0000000000..1faa305b52 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetIsSupplementedBy.java @@ -0,0 +1,38 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMissingDatasetIsSupplementedBy + extends UpdateMatcher>, eu.dnetlib.broker.objects.Dataset> { + + public EnrichMissingDatasetIsSupplementedBy() { + super(true); + } + + @Override + protected List> findUpdates(final Pair> source, + final Pair> target) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected UpdateInfo generateUpdateInfo( + final eu.dnetlib.broker.objects.Dataset highlightValue, + final Pair> source, + final Pair> target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_BY, + highlightValue, source.getLeft(), target.getLeft(), + (p, rel) -> p.getDatasets().add(rel), + rel -> rel.getInstances().get(0).getUrl()); + } +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetIsSupplementedTo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetIsSupplementedTo.java new file mode 100644 index 0000000000..d1b067272d --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetIsSupplementedTo.java @@ -0,0 +1,38 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMissingDatasetIsSupplementedTo + extends UpdateMatcher>, eu.dnetlib.broker.objects.Dataset> { + + public EnrichMissingDatasetIsSupplementedTo() { + super(true); + } + + @Override + protected List> findUpdates(final Pair> source, + final Pair> target) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected UpdateInfo generateUpdateInfo( + final eu.dnetlib.broker.objects.Dataset highlightValue, + final Pair> source, + final Pair> target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_TO, + highlightValue, source.getLeft(), target.getLeft(), + (p, rel) -> p.getDatasets().add(rel), + rel -> rel.getInstances().get(0).getUrl()); + } +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetReferences.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetReferences.java new file mode 100644 index 0000000000..ce6adeba27 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetReferences.java @@ -0,0 +1,38 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMissingDatasetReferences + extends UpdateMatcher>, eu.dnetlib.broker.objects.Dataset> { + + public EnrichMissingDatasetReferences() { + super(true); + } + + @Override + protected List> findUpdates(final Pair> source, + final Pair> target) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected UpdateInfo generateUpdateInfo( + final eu.dnetlib.broker.objects.Dataset highlightValue, + final Pair> source, + final Pair> target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_DATASET_REFERENCES, + highlightValue, source.getLeft(), target.getLeft(), + (p, rel) -> p.getDatasets().add(rel), + rel -> rel.getInstances().get(0).getUrl()); + } +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingOpenAccess.java index a4a2ea0c6c..81263c6c3f 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingOpenAccess.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingOpenAccess.java @@ -12,7 +12,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMissingOpenAccess extends UpdateMatcher { +public class EnrichMissingOpenAccess extends UpdateMatcher { public EnrichMissingOpenAccess() { super(true); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPid.java index a8df62541e..5f10bb4d9e 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPid.java @@ -11,7 +11,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMissingPid extends UpdateMatcher { +public class EnrichMissingPid extends UpdateMatcher { public EnrichMissingPid() { super(true); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingProject.java index b6e5b3b574..0197c99b1b 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingProject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingProject.java @@ -4,30 +4,35 @@ package eu.dnetlib.dhp.broker.oa.matchers; import java.util.Arrays; import java.util.List; -import eu.dnetlib.broker.objects.Project; +import org.apache.commons.lang3.tuple.Pair; + import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMissingProject extends UpdateMatcher { +public class EnrichMissingProject + extends UpdateMatcher>, eu.dnetlib.broker.objects.Project> { public EnrichMissingProject() { super(true); } @Override - protected List> findUpdates(final Result source, final Result target) { - // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); + protected List> findUpdates(final Pair> source, + final Pair> target) { + // TODO return Arrays.asList(); } @Override - public UpdateInfo generateUpdateInfo(final Project highlightValue, - final Result source, - final Result target) { + public UpdateInfo generateUpdateInfo( + final eu.dnetlib.broker.objects.Project highlightValue, + final Pair> source, + final Pair> target) { return new UpdateInfo<>( Topic.ENRICH_MISSING_PROJECT, - highlightValue, source, target, + highlightValue, source.getLeft(), target.getLeft(), (p, prj) -> p.getProjects().add(prj), prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode()); } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationDate.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationDate.java index 372a4e4c9d..19ef2bab7e 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationDate.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationDate.java @@ -9,7 +9,7 @@ import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMissingPublicationDate extends UpdateMatcher { +public class EnrichMissingPublicationDate extends UpdateMatcher { public EnrichMissingPublicationDate() { super(false); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationIsReferencedBy.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationIsReferencedBy.java new file mode 100644 index 0000000000..8bcee5a1fc --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationIsReferencedBy.java @@ -0,0 +1,42 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.Arrays; +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMissingPublicationIsReferencedBy + extends UpdateMatcher>, eu.dnetlib.broker.objects.Publication> { + + public EnrichMissingPublicationIsReferencedBy() { + super(true); + } + + @Override + protected List> findUpdates( + final Pair> source, + final Pair> target) { + // TODO Auto-generated method stub + return Arrays.asList(); + } + + @Override + protected UpdateInfo generateUpdateInfo( + final eu.dnetlib.broker.objects.Publication highlightValue, + final Pair> source, + final Pair> target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_PUBLICATION_IS_REFERENCED_BY, + highlightValue, source.getLeft(), target.getLeft(), + (p, rel) -> { + }, // p.getPublications().add(rel), //TODO available in the future release of dnet-openaire-broker-common + rel -> rel.getOriginalId()); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationIsRelatedTo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationIsRelatedTo.java new file mode 100644 index 0000000000..0c16f9f564 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationIsRelatedTo.java @@ -0,0 +1,42 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.Arrays; +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMissingPublicationIsRelatedTo + extends UpdateMatcher>, eu.dnetlib.broker.objects.Publication> { + + public EnrichMissingPublicationIsRelatedTo() { + super(true); + } + + @Override + protected List> findUpdates( + final Pair> source, + final Pair> target) { + // TODO Auto-generated method stub + return Arrays.asList(); + } + + @Override + protected UpdateInfo generateUpdateInfo( + final eu.dnetlib.broker.objects.Publication highlightValue, + final Pair> source, + final Pair> target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_PUBLICATION_IS_RELATED_TO, + highlightValue, source.getLeft(), target.getLeft(), + (p, rel) -> { + }, // p.getPublications().add(rel), //TODO available in the future release of dnet-openaire-broker-common + rel -> rel.getOriginalId()); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationIsSupplementedBy.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationIsSupplementedBy.java new file mode 100644 index 0000000000..0b3c332708 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationIsSupplementedBy.java @@ -0,0 +1,42 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.Arrays; +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMissingPublicationIsSupplementedBy + extends UpdateMatcher>, eu.dnetlib.broker.objects.Publication> { + + public EnrichMissingPublicationIsSupplementedBy() { + super(true); + } + + @Override + protected List> findUpdates( + final Pair> source, + final Pair> target) { + // TODO Auto-generated method stub + return Arrays.asList(); + } + + @Override + protected UpdateInfo generateUpdateInfo( + final eu.dnetlib.broker.objects.Publication highlightValue, + final Pair> source, + final Pair> target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_BY, + highlightValue, source.getLeft(), target.getLeft(), + (p, rel) -> { + }, // p.getPublications().add(rel), //TODO available in the future release of dnet-openaire-broker-common + rel -> rel.getOriginalId()); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationIsSupplementedTo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationIsSupplementedTo.java new file mode 100644 index 0000000000..0e72a84239 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationIsSupplementedTo.java @@ -0,0 +1,42 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.Arrays; +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMissingPublicationIsSupplementedTo + extends UpdateMatcher>, eu.dnetlib.broker.objects.Publication> { + + public EnrichMissingPublicationIsSupplementedTo() { + super(true); + } + + @Override + protected List> findUpdates( + final Pair> source, + final Pair> target) { + // TODO Auto-generated method stub + return Arrays.asList(); + } + + @Override + protected UpdateInfo generateUpdateInfo( + final eu.dnetlib.broker.objects.Publication highlightValue, + final Pair> source, + final Pair> target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_TO, + highlightValue, source.getLeft(), target.getLeft(), + (p, rel) -> { + }, // p.getPublications().add(rel), //TODO available in the future release of dnet-openaire-broker-common + rel -> rel.getOriginalId()); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationReferences.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationReferences.java new file mode 100644 index 0000000000..6d1124974e --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationReferences.java @@ -0,0 +1,42 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.Arrays; +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMissingPublicationReferences + extends UpdateMatcher>, eu.dnetlib.broker.objects.Publication> { + + public EnrichMissingPublicationReferences() { + super(true); + } + + @Override + protected List> findUpdates( + final Pair> source, + final Pair> target) { + // TODO Auto-generated method stub + return Arrays.asList(); + } + + @Override + protected UpdateInfo generateUpdateInfo( + final eu.dnetlib.broker.objects.Publication highlightValue, + final Pair> source, + final Pair> target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_PUBLICATION_REFERENCES, + highlightValue, source.getLeft(), target.getLeft(), + (p, rel) -> { + }, // p.getPublications().add(rel), //TODO available in the future release of dnet-openaire-broker-common + rel -> rel.getOriginalId()); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingSoftware.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingSoftware.java new file mode 100644 index 0000000000..954ee48be7 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingSoftware.java @@ -0,0 +1,41 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.Arrays; +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.Software; + +public class EnrichMissingSoftware + extends UpdateMatcher>, eu.dnetlib.broker.objects.Software> { + + public EnrichMissingSoftware() { + super(true); + } + + @Override + protected List> findUpdates( + final Pair> source, + final Pair> target) { + // TODO + return Arrays.asList(); + } + + @Override + public UpdateInfo generateUpdateInfo( + final eu.dnetlib.broker.objects.Software highlightValue, + final Pair> source, + final Pair> target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_SOFTWARE, + highlightValue, source.getLeft(), target.getLeft(), + (p, s) -> p.getSoftwares().add(s), + s -> s.getName()); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingSubject.java index 79e9d469be..a7c72f6ea3 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingSubject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingSubject.java @@ -14,7 +14,7 @@ import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -public class EnrichMissingSubject extends UpdateMatcher> { +public class EnrichMissingSubject extends UpdateMatcher> { public EnrichMissingSubject() { super(true); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreOpenAccess.java index 40c9b0500a..2cd2775c9e 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreOpenAccess.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreOpenAccess.java @@ -12,7 +12,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMoreOpenAccess extends UpdateMatcher { +public class EnrichMoreOpenAccess extends UpdateMatcher { public EnrichMoreOpenAccess() { super(true); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMorePid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMorePid.java index 0e7b7766ab..048d19747d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMorePid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMorePid.java @@ -11,7 +11,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMorePid extends UpdateMatcher { +public class EnrichMorePid extends UpdateMatcher { public EnrichMorePid() { super(true); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreProject.java new file mode 100644 index 0000000000..4bf45d9437 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreProject.java @@ -0,0 +1,39 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.Arrays; +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Project; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMoreProject extends UpdateMatcher>, eu.dnetlib.broker.objects.Project> { + + public EnrichMoreProject() { + super(true); + } + + @Override + protected List> findUpdates(final Pair> source, + final Pair> target) { + // TODO + return Arrays.asList(); + } + + @Override + public UpdateInfo generateUpdateInfo( + final eu.dnetlib.broker.objects.Project highlightValue, + final Pair> source, + final Pair> target) { + return new UpdateInfo<>( + Topic.ENRICH_MORE_PROJECT, + highlightValue, source.getLeft(), target.getLeft(), + (p, prj) -> p.getProjects().add(prj), + prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode()); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreSoftware.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreSoftware.java new file mode 100644 index 0000000000..9760504f64 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreSoftware.java @@ -0,0 +1,41 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.Arrays; +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.Software; + +public class EnrichMoreSoftware + extends UpdateMatcher>, eu.dnetlib.broker.objects.Software> { + + public EnrichMoreSoftware() { + super(true); + } + + @Override + protected List> findUpdates( + final Pair> source, + final Pair> target) { + // TODO + return Arrays.asList(); + } + + @Override + public UpdateInfo generateUpdateInfo( + final eu.dnetlib.broker.objects.Software highlightValue, + final Pair> source, + final Pair> target) { + return new UpdateInfo<>( + Topic.ENRICH_MORE_SOFTWARE, + highlightValue, source.getLeft(), target.getLeft(), + (p, s) -> p.getSoftwares().add(s), + s -> s.getName()); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreSubject.java index e6374479bf..67c2f01161 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreSubject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreSubject.java @@ -12,7 +12,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMoreSubject extends UpdateMatcher> { +public class EnrichMoreSubject extends UpdateMatcher> { public EnrichMoreSubject() { super(true); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java index d91b03200e..5bfe108a56 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java @@ -12,9 +12,8 @@ import org.apache.commons.lang3.StringUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Field; -import eu.dnetlib.dhp.schema.oaf.Result; -public abstract class UpdateMatcher { +public abstract class UpdateMatcher { private final boolean multipleUpdate; @@ -22,15 +21,16 @@ public abstract class UpdateMatcher { this.multipleUpdate = multipleUpdate; } - public Collection> searchUpdatesForRecord(final Result res, final Result... others) { + public Collection> searchUpdatesForRecord(final K res, final Collection others) { final Map> infoMap = new HashMap<>(); - for (final Result source : others) { + for (final K source : others) { if (source != res) { for (final UpdateInfo info : findUpdates(source, res)) { final String s = DigestUtils.md5Hex(info.getHighlightValueAsString()); - if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) {} else { + if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) { + } else { infoMap.put(s, info); } } @@ -51,11 +51,11 @@ public abstract class UpdateMatcher { } } - protected abstract List> findUpdates(Result source, Result target); + protected abstract List> findUpdates(K source, K target); protected abstract UpdateInfo generateUpdateInfo(final T highlightValue, - final Result source, - final Result target); + final K source, + final K target); protected static boolean isMissing(final List> list) { return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0).getValue()); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java index d61d5bfb7d..8e97192bfb 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java @@ -4,4 +4,6 @@ package eu.dnetlib.dhp.broker.oa.util; public class BrokerConstants { public final static String OPEN_ACCESS = "OPEN"; + public final static String IS_MERGED_IN_CLASS = "isMergedIn"; + } From b33dd58be4dbaeb841363c61d95f3f9bab890d25 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 22 May 2020 08:50:06 +0200 Subject: [PATCH 10/17] replaced parameter 'reuseRecords' with 'resumeFrom', allowing to restart the provision workflow execution from any step, useful for manual submissions or debugging --- .../dhp/oa/provision/oozie_app/workflow.xml | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml index fd8f5ba89a..6983ecf53c 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml @@ -84,7 +84,6 @@ sparkNetworkTimeout configures spark.network.timeout - @@ -98,12 +97,16 @@ - + - + - ${wf:conf('reuseRecords') eq false} - ${wf:conf('reuseRecords') eq true} + ${wf:conf('resumeFrom') eq 'prepare_relations'} + ${wf:conf('resumeFrom') eq 'fork_join_related_entities'} + ${wf:conf('resumeFrom') eq 'join_all_entities'} + ${wf:conf('resumeFrom') eq 'adjancency_lists'} + ${wf:conf('resumeFrom') eq 'convert_to_xml'} + ${wf:conf('resumeFrom') eq 'to_solr_index'} @@ -131,9 +134,7 @@ --inputRelationsPath${inputGraphRootPath}/relation --outputPath${workingDir}/relation - --relPartitions${relPartitions} - --relationFilter${relationFilter} - --maxRelations${maxRelations} + --relPartitions3000 @@ -340,7 +341,6 @@ - yarn @@ -362,7 +362,7 @@ --inputGraphRootPath${inputGraphRootPath} --inputRelatedEntitiesPath${workingDir}/join_partial --outputPath${workingDir}/join_entities - --numPartitions24000 + --numPartitions12000 @@ -386,7 +386,7 @@ --conf spark.sql.shuffle.partitions=7680 --conf spark.network.timeout=${sparkNetworkTimeout} - --inputPath${workingDir}/join_entities + --inputPath ${workingDir}/join_entities --outputPath${workingDir}/joined From 925d933204529190205d67c0f580ef58975575e1 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 22 May 2020 08:50:44 +0200 Subject: [PATCH 11/17] making XmlRecordFactory immune to graph encoding changes (mostly to avoid NPEs) --- .../oa/provision/utils/TemplateFactory.java | 3 +- .../oa/provision/utils/XmlRecordFactory.java | 77 +++++++++++++++---- 2 files changed, 62 insertions(+), 18 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java index 6cb025b4fd..21b526ab1b 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java @@ -5,6 +5,7 @@ import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.removePrefix; import static eu.dnetlib.dhp.oa.provision.utils.XmlSerializationUtils.escapeXml; import java.io.IOException; +import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.stream.Collectors; @@ -95,7 +96,7 @@ public class TemplateFactory { .add("metadata", instancemetadata) .add( "webresources", - webresources + (webresources != null ? webresources : new ArrayList()) .stream() .filter(StringUtils::isNotBlank) .map(w -> getWebResource(w)) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java index ce1c71312e..6f042b45cf 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java @@ -174,6 +174,7 @@ public class XmlRecordFactory implements Serializable { entity .getCollectedfrom() .stream() + .filter(XmlRecordFactory::kvNotBlank) .map(kv -> XmlSerializationUtils.mapKeyValue("collectedfrom", kv)) .collect(Collectors.toList())); } @@ -183,6 +184,7 @@ public class XmlRecordFactory implements Serializable { entity .getOriginalId() .stream() + .filter(Objects::nonNull) .map(s -> XmlSerializationUtils.asXmlElement("originalId", s)) .collect(Collectors.toList())); } @@ -192,6 +194,7 @@ public class XmlRecordFactory implements Serializable { entity .getPid() .stream() + .filter(Objects::nonNull) .map(p -> XmlSerializationUtils.mapStructuredProperty("pid", p)) .collect(Collectors.toList())); } @@ -213,6 +216,7 @@ public class XmlRecordFactory implements Serializable { r .getTitle() .stream() + .filter(Objects::nonNull) .map(t -> XmlSerializationUtils.mapStructuredProperty("title", t)) .collect(Collectors.toList())); } @@ -225,6 +229,7 @@ public class XmlRecordFactory implements Serializable { r .getAuthor() .stream() + .filter(Objects::nonNull) .map( a -> { final StringBuilder sb = new StringBuilder(" isNotBlank(sp.getQualifier().getClassid()) && isNotBlank(sp.getValue())) + .collect( + Collectors + .toMap( + p -> getAuthorPidType(p.getQualifier().getClassid()), + p -> p, + (p1, p2) -> p1)) + .values() .forEach( sp -> { - String pidType = XmlSerializationUtils - .escapeXml( - sp.getQualifier().getClassid()) - .replaceAll("\\W", ""); + String pidType = getAuthorPidType(sp.getQualifier().getClassid()); String pidValue = XmlSerializationUtils.escapeXml(sp.getValue()); - // ugly hack: some records - // provide swapped pidtype and - // pidvalue + // ugly hack: some records provide swapped pidtype and pidvalue if (authorPidTypes.contains(pidValue.toLowerCase().trim())) { sb.append(String.format(" %s=\"%s\"", pidValue, pidType)); } else { - pidType = pidType.replaceAll("\\W", "").replaceAll("\\d", ""); if (isNotBlank(pidType)) { sb .append( @@ -285,6 +292,7 @@ public class XmlRecordFactory implements Serializable { r .getContributor() .stream() + .filter(Objects::nonNull) .map(c -> XmlSerializationUtils.asXmlElement("contributor", c.getValue())) .collect(Collectors.toList())); } @@ -294,6 +302,7 @@ public class XmlRecordFactory implements Serializable { r .getCountry() .stream() + .filter(Objects::nonNull) .map(c -> XmlSerializationUtils.mapQualifier("country", c)) .collect(Collectors.toList())); } @@ -303,6 +312,7 @@ public class XmlRecordFactory implements Serializable { r .getCoverage() .stream() + .filter(Objects::nonNull) .map(c -> XmlSerializationUtils.asXmlElement("coverage", c.getValue())) .collect(Collectors.toList())); } @@ -319,6 +329,7 @@ public class XmlRecordFactory implements Serializable { r .getDescription() .stream() + .filter(Objects::nonNull) .map(c -> XmlSerializationUtils.asXmlElement("description", c.getValue())) .collect(Collectors.toList())); } @@ -333,6 +344,7 @@ public class XmlRecordFactory implements Serializable { r .getSubject() .stream() + .filter(Objects::nonNull) .map(s -> XmlSerializationUtils.mapStructuredProperty("subject", s)) .collect(Collectors.toList())); } @@ -345,6 +357,7 @@ public class XmlRecordFactory implements Serializable { r .getRelevantdate() .stream() + .filter(Objects::nonNull) .map(s -> XmlSerializationUtils.mapStructuredProperty("relevantdate", s)) .collect(Collectors.toList())); } @@ -357,6 +370,7 @@ public class XmlRecordFactory implements Serializable { r .getSource() .stream() + .filter(Objects::nonNull) .map(c -> XmlSerializationUtils.asXmlElement("source", c.getValue())) .collect(Collectors.toList())); } @@ -366,6 +380,7 @@ public class XmlRecordFactory implements Serializable { r .getFormat() .stream() + .filter(Objects::nonNull) .map(c -> XmlSerializationUtils.asXmlElement("format", c.getValue())) .collect(Collectors.toList())); } @@ -429,6 +444,7 @@ public class XmlRecordFactory implements Serializable { orp .getContactperson() .stream() + .filter(Objects::nonNull) .map(c -> XmlSerializationUtils.asXmlElement("contactperson", c.getValue())) .collect(Collectors.toList())); } @@ -439,6 +455,7 @@ public class XmlRecordFactory implements Serializable { orp .getContactgroup() .stream() + .filter(Objects::nonNull) .map(c -> XmlSerializationUtils.asXmlElement("contactgroup", c.getValue())) .collect(Collectors.toList())); } @@ -448,6 +465,7 @@ public class XmlRecordFactory implements Serializable { orp .getTool() .stream() + .filter(Objects::nonNull) .map(c -> XmlSerializationUtils.asXmlElement("tool", c.getValue())) .collect(Collectors.toList())); } @@ -461,6 +479,7 @@ public class XmlRecordFactory implements Serializable { s .getDocumentationUrl() .stream() + .filter(Objects::nonNull) .map(c -> XmlSerializationUtils.asXmlElement("documentationUrl", c.getValue())) .collect(Collectors.toList())); } @@ -470,6 +489,7 @@ public class XmlRecordFactory implements Serializable { s .getLicense() .stream() + .filter(Objects::nonNull) .map(l -> XmlSerializationUtils.mapStructuredProperty("license", l)) .collect(Collectors.toList())); } @@ -576,6 +596,7 @@ public class XmlRecordFactory implements Serializable { ds .getOdlanguages() .stream() + .filter(Objects::nonNull) .map(c -> XmlSerializationUtils.asXmlElement("odlanguages", c.getValue())) .collect(Collectors.toList())); } @@ -585,6 +606,7 @@ public class XmlRecordFactory implements Serializable { ds .getOdcontenttypes() .stream() + .filter(Objects::nonNull) .map(c -> XmlSerializationUtils.asXmlElement("odcontenttypes", c.getValue())) .collect(Collectors.toList())); } @@ -697,6 +719,7 @@ public class XmlRecordFactory implements Serializable { ds .getPolicies() .stream() + .filter(XmlRecordFactory::kvNotBlank) .map(kv -> XmlSerializationUtils.mapKeyValue("policies", kv)) .collect(Collectors.toList())); } @@ -709,6 +732,7 @@ public class XmlRecordFactory implements Serializable { ds .getSubjects() .stream() + .filter(Objects::nonNull) .map(sp -> XmlSerializationUtils.mapStructuredProperty("subjects", sp)) .collect(Collectors.toList())); } @@ -735,6 +759,7 @@ public class XmlRecordFactory implements Serializable { o .getAlternativeNames() .stream() + .filter(Objects::nonNull) .map(c -> XmlSerializationUtils.asXmlElement("alternativeNames", c.getValue())) .collect(Collectors.toList())); } @@ -862,6 +887,7 @@ public class XmlRecordFactory implements Serializable { p .getSubjects() .stream() + .filter(Objects::nonNull) .map(sp -> XmlSerializationUtils.mapStructuredProperty("subject", sp)) .collect(Collectors.toList())); } @@ -912,7 +938,12 @@ public class XmlRecordFactory implements Serializable { if (p.getFundingtree() != null) { metadata .addAll( - p.getFundingtree().stream().map(ft -> ft.getValue()).collect(Collectors.toList())); + p + .getFundingtree() + .stream() + .filter(Objects::nonNull) + .map(ft -> ft.getValue()) + .collect(Collectors.toList())); } break; @@ -923,6 +954,17 @@ public class XmlRecordFactory implements Serializable { return metadata; } + private String getAuthorPidType(String s) { + return XmlSerializationUtils + .escapeXml(s) + .replaceAll("\\W", "") + .replaceAll("\\d", ""); + } + + private static boolean kvNotBlank(KeyValue kv) { + return kv != null && StringUtils.isNotBlank(kv.getKey()) && StringUtils.isNotBlank(kv.getValue()); + } + private void mapDatasourceType(List metadata, final Qualifier dsType) { metadata.add(XmlSerializationUtils.mapQualifier("datasourcetype", dsType)); @@ -960,7 +1002,7 @@ public class XmlRecordFactory implements Serializable { .add( XmlSerializationUtils.asXmlElement("coderepositoryurl", re.getCodeRepositoryUrl())); } - if (re.getResulttype() != null & re.getResulttype().isBlank()) { + if (re.getResulttype() != null && re.getResulttype().isBlank()) { metadata.add(XmlSerializationUtils.mapQualifier("resulttype", re.getResulttype())); } if (re.getCollectedfrom() != null) { @@ -969,6 +1011,7 @@ public class XmlRecordFactory implements Serializable { re .getCollectedfrom() .stream() + .filter(XmlRecordFactory::kvNotBlank) .map(kv -> XmlSerializationUtils.mapKeyValue("collectedfrom", kv)) .collect(Collectors.toList())); } @@ -986,10 +1029,10 @@ public class XmlRecordFactory implements Serializable { if (isNotBlank(re.getOfficialname())) { metadata.add(XmlSerializationUtils.asXmlElement("officialname", re.getOfficialname())); } - if (re.getDatasourcetype() != null & !re.getDatasourcetype().isBlank()) { + if (re.getDatasourcetype() != null && !re.getDatasourcetype().isBlank()) { mapDatasourceType(metadata, re.getDatasourcetype()); } - if (re.getOpenairecompatibility() != null & !re.getOpenairecompatibility().isBlank()) { + if (re.getOpenairecompatibility() != null && !re.getOpenairecompatibility().isBlank()) { metadata .add( XmlSerializationUtils @@ -1006,7 +1049,7 @@ public class XmlRecordFactory implements Serializable { .add( XmlSerializationUtils.asXmlElement("legalshortname", re.getLegalshortname())); } - if (re.getCountry() != null & !re.getCountry().isBlank()) { + if (re.getCountry() != null && !re.getCountry().isBlank()) { metadata.add(XmlSerializationUtils.mapQualifier("country", re.getCountry())); } break; @@ -1020,10 +1063,10 @@ public class XmlRecordFactory implements Serializable { if (isNotBlank(re.getAcronym())) { metadata.add(XmlSerializationUtils.asXmlElement("acronym", re.getAcronym())); } - if (re.getContracttype() != null & !re.getContracttype().isBlank()) { + if (re.getContracttype() != null && !re.getContracttype().isBlank()) { metadata.add(XmlSerializationUtils.mapQualifier("contracttype", re.getContracttype())); } - if (re.getFundingtree() != null & contexts != null) { + if (re.getFundingtree() != null && contexts != null) { metadata .addAll( re @@ -1091,12 +1134,12 @@ public class XmlRecordFactory implements Serializable { .add( XmlSerializationUtils.mapQualifier("accessright", instance.getAccessright())); } - if (instance.getCollectedfrom() != null) { + if (instance.getCollectedfrom() != null && kvNotBlank(instance.getCollectedfrom())) { fields .add( XmlSerializationUtils.mapKeyValue("collectedfrom", instance.getCollectedfrom())); } - if (instance.getHostedby() != null) { + if (instance.getHostedby() != null && kvNotBlank(instance.getHostedby())) { fields.add(XmlSerializationUtils.mapKeyValue("hostedby", instance.getHostedby())); } if (instance.getDateofacceptance() != null From c5f7e173481f7b761a56742067454f58ff0b4ace Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Fri, 22 May 2020 10:08:02 +0200 Subject: [PATCH 12/17] author fullnames --- .../dhp/oa/graph/raw/OdfToOafMapper.java | 126 ++++++++---------- 1 file changed, 55 insertions(+), 71 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 30b980c422..3a56aa8e0e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -4,16 +4,32 @@ package eu.dnetlib.dhp.oa.graph.raw; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty; -import static eu.dnetlib.dhp.schema.common.ModelConstants.*; +import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES; +import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_DATE; +import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_RESOURCE; +import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_LANGUAGES; +import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES; +import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PUBLICATION_RESOURCE; +import static eu.dnetlib.dhp.schema.common.ModelConstants.HAS_PARTS; +import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PART_OF; +import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_SUPPLEMENTED_BY; +import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_SUPPLEMENT_TO; +import static eu.dnetlib.dhp.schema.common.ModelConstants.PART; +import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT; +import static eu.dnetlib.dhp.schema.common.ModelConstants.SUPPLEMENT; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; import org.apache.commons.lang3.StringUtils; import org.dom4j.Document; import org.dom4j.Node; import eu.dnetlib.dhp.oa.graph.raw.common.PacePerson; -import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Field; @@ -22,7 +38,6 @@ import eu.dnetlib.dhp.schema.oaf.Instance; import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.schema.oaf.Qualifier; -import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; public class OdfToOafMapper extends AbstractMdRecordToOafMapper { @@ -48,7 +63,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { final String fullname = n.valueOf("./datacite:creatorName"); author.setFullname(fullname); - PacePerson pp = new PacePerson(fullname, false); + final PacePerson pp = new PacePerson(fullname, false); final String name = n.valueOf("./datacite:givenName"); if (StringUtils.isBlank(name) & pp.isAccurate()) { author.setName(pp.getNormalisedFirstName()); @@ -63,6 +78,10 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { author.setSurname(surname); } + if (StringUtils.isBlank(author.getFullname())) { + author.setFullname(String.format("%s, %s", author.getSurname(), author.getName())); + } + author.setAffiliation(prepareListFields(n, "./datacite:affiliation", info)); author.setPid(preparePids(n, info)); author.setRank(pos++); @@ -75,12 +94,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { final List res = new ArrayList<>(); for (final Object o : n.selectNodes("./datacite:nameIdentifier")) { res - .add( - structuredProperty( - ((Node) o).getText(), - prepareQualifier( - (Node) o, "./@nameIdentifierScheme", DNET_PID_TYPES, DNET_PID_TYPES), - info)); + .add(structuredProperty(((Node) o).getText(), prepareQualifier((Node) o, "./@nameIdentifierScheme", DNET_PID_TYPES, DNET_PID_TYPES), info)); } return res; } @@ -94,22 +108,18 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { final Instance instance = new Instance(); instance - .setInstancetype( - prepareQualifier( - doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); + .setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); instance.setCollectedfrom(collectedfrom); instance.setHostedby(hostedby); instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); instance - .setAccessright( - prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES, DNET_ACCESS_MODES)); + .setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES, DNET_ACCESS_MODES)); instance.setLicense(field(doc.valueOf("//oaf:license"), info)); instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info)); instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info)); instance - .setProcessingchargecurrency( - field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); + .setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); final Set url = new HashSet<>(); for (final Object o : doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='URL']")) { @@ -147,14 +157,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { && !dateType.equalsIgnoreCase("Updated") && !dateType.equalsIgnoreCase("Available")) { res - .add( - structuredProperty( - ((Node) o).getText(), - "UNKNOWN", - "UNKNOWN", - DNET_DATA_CITE_DATE, - DNET_DATA_CITE_DATE, - info)); + .add(structuredProperty(((Node) o).getText(), "UNKNOWN", "UNKNOWN", DNET_DATA_CITE_DATE, DNET_DATA_CITE_DATE, info)); } } return res; @@ -197,53 +200,49 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { @Override protected List> prepareOtherResearchProductTools( - final Document doc, final DataInfo info) { + final Document doc, + final DataInfo info) { return new ArrayList<>(); // Not present in ODF ??? } @Override protected List> prepareOtherResearchProductContactGroups( - final Document doc, final DataInfo info) { - return prepareListFields( - doc, - "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", - info); + final Document doc, + final DataInfo info) { + return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", info); } @Override protected List> prepareOtherResearchProductContactPersons( - final Document doc, final DataInfo info) { - return prepareListFields( - doc, - "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", - info); + final Document doc, + final DataInfo info) { + return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", info); } @Override protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) { - return prepareQualifier( - doc, "//datacite:format", "dnet:programming_languages", "dnet:programming_languages"); + return prepareQualifier(doc, "//datacite:format", "dnet:programming_languages", "dnet:programming_languages"); } @Override protected Field prepareSoftwareCodeRepositoryUrl( - final Document doc, final DataInfo info) { + final Document doc, + final DataInfo info) { return null; // Not present in ODF ??? } @Override protected List prepareSoftwareLicenses( - final Document doc, final DataInfo info) { + final Document doc, + final DataInfo info) { return new ArrayList<>(); // Not present in ODF ??? } @Override protected List> prepareSoftwareDocumentationUrls( - final Document doc, final DataInfo info) { - return prepareListFields( - doc, - "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", - info); + final Document doc, + final DataInfo info) { + return prepareListFields(doc, "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info); } // DATASETS @@ -264,13 +263,15 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { @Override protected Field prepareDatasetMetadataVersionNumber( - final Document doc, final DataInfo info) { + final Document doc, + final DataInfo info) { return null; // Not present in ODF ??? } @Override protected Field prepareDatasetLastMetadataUpdate( - final Document doc, final DataInfo info) { + final Document doc, + final DataInfo info) { return prepareField(doc, "//datacite:date[@dateType='Updated']", info); } @@ -315,29 +316,16 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { if (type.equalsIgnoreCase("IsSupplementTo")) { res - .add( - getRelation( - docId, otherId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, collectedFrom, info, - lastUpdateTimestamp)); + .add(getRelation(docId, otherId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, collectedFrom, info, lastUpdateTimestamp)); res - .add( - getRelation( - otherId, docId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENTED_BY, collectedFrom, info, - lastUpdateTimestamp)); + .add(getRelation(otherId, docId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENTED_BY, collectedFrom, info, lastUpdateTimestamp)); } else if (type.equals("IsPartOf")) { res - .add( - getRelation( - docId, otherId, RESULT_RESULT, PART, IS_PART_OF, collectedFrom, info, - lastUpdateTimestamp)); + .add(getRelation(docId, otherId, RESULT_RESULT, PART, IS_PART_OF, collectedFrom, info, lastUpdateTimestamp)); res - .add( - getRelation( - otherId, docId, RESULT_RESULT, PART, HAS_PARTS, collectedFrom, info, - lastUpdateTimestamp)); - } else { - } + .add(getRelation(otherId, docId, RESULT_RESULT, PART, HAS_PARTS, collectedFrom, info, lastUpdateTimestamp)); + } else {} } } return res; @@ -345,10 +333,6 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { @Override protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { - return prepareQualifier( - doc, - "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", - DNET_DATA_CITE_RESOURCE, - DNET_DATA_CITE_RESOURCE); + return prepareQualifier(doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", DNET_DATA_CITE_RESOURCE, DNET_DATA_CITE_RESOURCE); } } From 9de71e54a8ae4f8bab8c78b5da4d790be711918e Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Fri, 22 May 2020 10:47:39 +0200 Subject: [PATCH 13/17] filter ORCID e MAG identifiers --- .../dhp/oa/graph/raw/OdfToOafMapper.java | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 3a56aa8e0e..1d4f80894e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -3,6 +3,7 @@ package eu.dnetlib.dhp.oa.graph.raw; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field; +import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty; import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES; import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_DATE; @@ -44,6 +45,9 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { public static final String HTTP_DX_DOI_PREIFX = "http://dx.doi.org/"; + public static final Qualifier ORCID_PID_TYPE = qualifier("ORCID", "Open Researcher and Contributor ID", DNET_PID_TYPES, DNET_PID_TYPES); + public static final Qualifier MAG_PID_TYPE = qualifier("MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES); + public OdfToOafMapper(final Map code2name) { super(code2name); } @@ -93,8 +97,19 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { private List preparePids(final Node n, final DataInfo info) { final List res = new ArrayList<>(); for (final Object o : n.selectNodes("./datacite:nameIdentifier")) { - res - .add(structuredProperty(((Node) o).getText(), prepareQualifier((Node) o, "./@nameIdentifierScheme", DNET_PID_TYPES, DNET_PID_TYPES), info)); + + final String id = ((Node) o).getText(); + final String type = ((Node) o).valueOf("./@nameIdentifierScheme") + .trim() + .toUpperCase() + .replaceAll(" ", ""); + + if (type.startsWith("ORCID")) { + final String cleanedId = id.replaceAll("http://orcid.org/", "").replaceAll("https://orcid.org/", ""); + res.add(structuredProperty(cleanedId, ORCID_PID_TYPE, info)); + } else if (type.startsWith("MAGID")) { + res.add(structuredProperty(id, MAG_PID_TYPE, info)); + } } return res; } From 9f2d0f1b0893ba8fdba8ceef27f631a83c2aaa15 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Fri, 22 May 2020 11:00:27 +0200 Subject: [PATCH 14/17] filter ORCID e MAG identifiers --- .../dhp/oa/graph/raw/OdfToOafMapper.java | 3 ++- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 23 ++++++++++++------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 1d4f80894e..d51433a182 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -102,7 +102,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { final String type = ((Node) o).valueOf("./@nameIdentifierScheme") .trim() .toUpperCase() - .replaceAll(" ", ""); + .replaceAll(" ", "") + .replaceAll("_", ""); if (type.startsWith("ORCID")) { final String cleanedId = id.replaceAll("http://orcid.org/", "").replaceAll("https://orcid.org/", ""); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 5a006e3513..6ff76e839a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -21,7 +21,14 @@ import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.Author; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Field; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Software; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; @ExtendWith(MockitoExtension.class) public class MappersTest { @@ -54,13 +61,13 @@ public class MappersTest { assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); assertTrue(p.getAuthor().size() > 0); - Optional author = p + final Optional author = p .getAuthor() .stream() .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) .findFirst(); assertTrue(author.isPresent()); - StructuredProperty pid = author + final StructuredProperty pid = author .get() .getPid() .stream() @@ -121,13 +128,13 @@ public class MappersTest { assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue())); assertTrue(d.getAuthor().size() > 0); - Optional author = d + final Optional author = d .getAuthor() .stream() .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) .findFirst(); assertTrue(author.isPresent()); - StructuredProperty pid = author + final StructuredProperty pid = author .get() .getPid() .stream() @@ -135,7 +142,7 @@ public class MappersTest { .get(); assertEquals("0000-0001-9074-1619", pid.getValue()); assertEquals("ORCID", pid.getQualifier().getClassid()); - assertEquals("ORCID", pid.getQualifier().getClassname()); + assertEquals("Open Researcher and Contributor ID", pid.getQualifier().getClassname()); assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemeid()); assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemename()); assertEquals("Baracchini, Theo", author.get().getFullname()); @@ -143,13 +150,13 @@ public class MappersTest { assertEquals("Theo", author.get().getName()); assertEquals(1, author.get().getAffiliation().size()); - Optional> opAff = author + final Optional> opAff = author .get() .getAffiliation() .stream() .findFirst(); assertTrue(opAff.isPresent()); - Field affiliation = opAff.get(); + final Field affiliation = opAff.get(); assertEquals("ISTI-CNR", affiliation.getValue()); assertTrue(d.getSubject().size() > 0); From dc4621b3cb98ab01185e350e99f35091b628d0c5 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Fri, 22 May 2020 12:25:01 +0200 Subject: [PATCH 15/17] filter ORCID e MAG identifiers --- .../raw/AbstractMdRecordToOafMapper.java | 223 ++++++++---------- .../dhp/oa/graph/raw/OafToOafMapper.java | 113 +++++---- .../dhp/oa/graph/raw/OdfToOafMapper.java | 5 - .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 2 +- 4 files changed, 169 insertions(+), 174 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index b9c4e6c804..84b200c076 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -10,7 +10,16 @@ import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.listFields; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.oaiIProvenance; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty; -import static eu.dnetlib.dhp.schema.common.ModelConstants.*; +import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASET_DEFAULT_RESULTTYPE; +import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES; +import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PRODUCED_BY; +import static eu.dnetlib.dhp.schema.common.ModelConstants.ORP_DEFAULT_RESULTTYPE; +import static eu.dnetlib.dhp.schema.common.ModelConstants.OUTCOME; +import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES; +import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE; +import static eu.dnetlib.dhp.schema.common.ModelConstants.REPOSITORY_PROVENANCE_ACTIONS; +import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT; +import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE; import java.util.ArrayList; import java.util.Arrays; @@ -50,6 +59,8 @@ public abstract class AbstractMdRecordToOafMapper { protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4"; protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3"; + protected static final Qualifier ORCID_PID_TYPE = qualifier("ORCID", "Open Researcher and Contributor ID", DNET_PID_TYPES, DNET_PID_TYPES); + protected static final Qualifier MAG_PID_TYPE = qualifier("MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES); protected static final Map nsContext = new HashMap<>(); @@ -63,8 +74,7 @@ public abstract class AbstractMdRecordToOafMapper { nsContext.put("datacite", DATACITE_SCHEMA_KERNEL_3); } - protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier( - "main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); + protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); protected AbstractMdRecordToOafMapper(final Map code2name) { this.code2name = code2name; @@ -75,24 +85,18 @@ public abstract class AbstractMdRecordToOafMapper { DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); final Document doc = DocumentHelper - .parseText( - xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3)); + .parseText(xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3)); final String type = doc.valueOf("//dr:CobjCategory/@type"); - final KeyValue collectedFrom = getProvenanceDatasource( - doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); + final KeyValue collectedFrom = getProvenanceDatasource(doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); - if (collectedFrom == null) { - return null; - } + if (collectedFrom == null) { return null; } final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) ? collectedFrom : getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name"); - if (hostedBy == null) { - return null; - } + if (hostedBy == null) { return null; } final DataInfo info = prepareDataInfo(doc); final long lastUpdateTimestamp = new Date().getTime(); @@ -103,17 +107,13 @@ public abstract class AbstractMdRecordToOafMapper { } } - private KeyValue getProvenanceDatasource(Document doc, String xpathId, String xpathName) { + private KeyValue getProvenanceDatasource(final Document doc, final String xpathId, final String xpathName) { final String dsId = doc.valueOf(xpathId); final String dsName = doc.valueOf(xpathName); - if (StringUtils.isBlank(dsId) | StringUtils.isBlank(dsName)) { - return null; - } + if (StringUtils.isBlank(dsId) | StringUtils.isBlank(dsName)) { return null; } - return keyValue( - createOpenaireId(10, dsId, true), - dsName); + return keyValue(createOpenaireId(10, dsId, true), dsName); } protected List createOafs( @@ -127,47 +127,47 @@ public abstract class AbstractMdRecordToOafMapper { final List oafs = new ArrayList<>(); switch (type.toLowerCase()) { - case "publication": - final Publication p = new Publication(); - populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - p.setResulttype(PUBLICATION_DEFAULT_RESULTTYPE); - p.setJournal(prepareJournal(doc, info)); - oafs.add(p); - break; - case "dataset": - final Dataset d = new Dataset(); - populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - d.setResulttype(DATASET_DEFAULT_RESULTTYPE); - d.setStoragedate(prepareDatasetStorageDate(doc, info)); - d.setDevice(prepareDatasetDevice(doc, info)); - d.setSize(prepareDatasetSize(doc, info)); - d.setVersion(prepareDatasetVersion(doc, info)); - d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info)); - d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info)); - d.setGeolocation(prepareDatasetGeoLocations(doc, info)); - oafs.add(d); - break; - case "software": - final Software s = new Software(); - populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - s.setResulttype(SOFTWARE_DEFAULT_RESULTTYPE); - s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info)); - s.setLicense(prepareSoftwareLicenses(doc, info)); - s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info)); - s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info)); - oafs.add(s); - break; - case "": - case "otherresearchproducts": - default: - final OtherResearchProduct o = new OtherResearchProduct(); - populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - o.setResulttype(ORP_DEFAULT_RESULTTYPE); - o.setContactperson(prepareOtherResearchProductContactPersons(doc, info)); - o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info)); - o.setTool(prepareOtherResearchProductTools(doc, info)); - oafs.add(o); - break; + case "publication": + final Publication p = new Publication(); + populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + p.setResulttype(PUBLICATION_DEFAULT_RESULTTYPE); + p.setJournal(prepareJournal(doc, info)); + oafs.add(p); + break; + case "dataset": + final Dataset d = new Dataset(); + populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + d.setResulttype(DATASET_DEFAULT_RESULTTYPE); + d.setStoragedate(prepareDatasetStorageDate(doc, info)); + d.setDevice(prepareDatasetDevice(doc, info)); + d.setSize(prepareDatasetSize(doc, info)); + d.setVersion(prepareDatasetVersion(doc, info)); + d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info)); + d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info)); + d.setGeolocation(prepareDatasetGeoLocations(doc, info)); + oafs.add(d); + break; + case "software": + final Software s = new Software(); + populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + s.setResulttype(SOFTWARE_DEFAULT_RESULTTYPE); + s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info)); + s.setLicense(prepareSoftwareLicenses(doc, info)); + s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info)); + s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info)); + oafs.add(s); + break; + case "": + case "otherresearchproducts": + default: + final OtherResearchProduct o = new OtherResearchProduct(); + populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + o.setResulttype(ORP_DEFAULT_RESULTTYPE); + o.setContactperson(prepareOtherResearchProductContactPersons(doc, info)); + o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info)); + o.setTool(prepareOtherResearchProductTools(doc, info)); + oafs.add(o); + break; } if (!oafs.isEmpty()) { @@ -196,23 +196,23 @@ public abstract class AbstractMdRecordToOafMapper { final String projectId = createOpenaireId(40, originalId, true); res - .add( - getRelation( - docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, collectedFrom, info, - lastUpdateTimestamp)); + .add(getRelation(docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, collectedFrom, info, lastUpdateTimestamp)); res - .add( - getRelation( - projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, collectedFrom, info, - lastUpdateTimestamp)); + .add(getRelation(projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, collectedFrom, info, lastUpdateTimestamp)); } } return res; } - protected Relation getRelation(String source, String target, String relType, String subRelType, String relClass, - KeyValue collectedFrom, DataInfo info, long lastUpdateTimestamp) { + protected Relation getRelation(final String source, + final String target, + final String relType, + final String subRelType, + final String relClass, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp) { final Relation rel = new Relation(); rel.setRelType(relType); rel.setSubRelType(subRelType); @@ -244,9 +244,7 @@ public abstract class AbstractMdRecordToOafMapper { r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier"))); r.setCollectedfrom(Arrays.asList(collectedFrom)); r - .setPid( - prepareListStructProps( - doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); + .setPid(prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); r.setDateofcollection(doc.valueOf("//dr:dateOfCollection")); r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation")); r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES @@ -289,7 +287,10 @@ public abstract class AbstractMdRecordToOafMapper { protected abstract Qualifier prepareResourceType(Document doc, DataInfo info); protected abstract List prepareInstances( - Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby); + Document doc, + DataInfo info, + KeyValue collectedfrom, + KeyValue hostedby); protected abstract List> prepareSources(Document doc, DataInfo info); @@ -314,13 +315,16 @@ public abstract class AbstractMdRecordToOafMapper { protected abstract List prepareAuthors(Document doc, DataInfo info); protected abstract List> prepareOtherResearchProductTools( - Document doc, DataInfo info); + Document doc, + DataInfo info); protected abstract List> prepareOtherResearchProductContactGroups( - Document doc, DataInfo info); + Document doc, + DataInfo info); protected abstract List> prepareOtherResearchProductContactPersons( - Document doc, DataInfo info); + Document doc, + DataInfo info); protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info); @@ -329,7 +333,8 @@ public abstract class AbstractMdRecordToOafMapper { protected abstract List prepareSoftwareLicenses(Document doc, DataInfo info); protected abstract List> prepareSoftwareDocumentationUrls( - Document doc, DataInfo info); + Document doc, + DataInfo info); protected abstract List prepareDatasetGeoLocations(Document doc, DataInfo info); @@ -357,27 +362,16 @@ public abstract class AbstractMdRecordToOafMapper { final String sp = n.valueOf("@sp"); final String vol = n.valueOf("@vol"); final String edition = n.valueOf("@edition"); - if (StringUtils.isNotBlank(name)) { - return journal( - name, - issnPrinted, - issnOnline, - issnLinking, - ep, - iss, - sp, - vol, - edition, - null, - null, - info); - } + if (StringUtils.isNotBlank(name)) { return journal(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, null, null, info); } } return null; } protected Qualifier prepareQualifier( - final Node node, final String xpath, final String schemeId, final String schemeName) { + final Node node, + final String xpath, + final String schemeId, + final String schemeName) { final String classId = node.valueOf(xpath); final String className = code2name.get(classId); return qualifier(classId, className, schemeId, schemeName); @@ -401,7 +395,10 @@ public abstract class AbstractMdRecordToOafMapper { } protected List prepareListStructProps( - final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) { + final Node node, + final String xpath, + final Qualifier qualifier, + final DataInfo info) { final List res = new ArrayList<>(); for (final Object o : node.selectNodes(xpath)) { final Node n = (Node) o; @@ -411,19 +408,14 @@ public abstract class AbstractMdRecordToOafMapper { } protected List prepareListStructProps( - final Node node, final String xpath, final DataInfo info) { + final Node node, + final String xpath, + final DataInfo info) { final List res = new ArrayList<>(); for (final Object o : node.selectNodes(xpath)) { final Node n = (Node) o; res - .add( - structuredProperty( - n.getText(), - n.valueOf("@classid"), - n.valueOf("@classname"), - n.valueOf("@schemeid"), - n.valueOf("@schemename"), - info)); + .add(structuredProperty(n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), n.valueOf("@schemename"), info)); } return res; } @@ -431,9 +423,7 @@ public abstract class AbstractMdRecordToOafMapper { protected OAIProvenance prepareOAIprovenance(final Document doc) { final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']"); - if (n == null) { - return null; - } + if (n == null) { return null; } final String identifier = n.valueOf("./*[local-name()='identifier']"); final String baseURL = n.valueOf("./*[local-name()='baseURL']"); @@ -448,10 +438,7 @@ public abstract class AbstractMdRecordToOafMapper { protected DataInfo prepareDataInfo(final Document doc) { final Node n = doc.selectSingleNode("//oaf:datainfo"); - if (n == null) { - return dataInfo( - false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9"); - } + if (n == null) { return dataInfo(false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9"); } final String paClassId = n.valueOf("./oaf:provenanceaction/@classid"); final String paClassName = n.valueOf("./oaf:provenanceaction/@classname"); @@ -463,13 +450,7 @@ public abstract class AbstractMdRecordToOafMapper { final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred")); final String trust = n.valueOf("./oaf:trust"); - return dataInfo( - deletedbyinference, - inferenceprovenance, - inferred, - false, - qualifier(paClassId, paClassName, paSchemeId, paSchemeName), - trust); + return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust); } protected Field prepareField(final Node node, final String xpath, final DataInfo info) { @@ -477,7 +458,9 @@ public abstract class AbstractMdRecordToOafMapper { } protected List> prepareListFields( - final Node node, final String xpath, final DataInfo info) { + final Node node, + final String xpath, + final DataInfo info) { return listFields(info, prepareListString(node, xpath)); } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index ed09016da8..24a8b45270 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -1,10 +1,19 @@ package eu.dnetlib.dhp.oa.graph.raw; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.*; -import static eu.dnetlib.dhp.schema.common.ModelConstants.*; +import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId; +import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field; +import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty; +import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES; +import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_LANGUAGES; +import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PUBLICATION_RESOURCE; +import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_RELATED_TO; +import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DATASET; +import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT; -import java.util.*; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; @@ -15,8 +24,15 @@ import org.dom4j.Node; import com.google.common.collect.Lists; import eu.dnetlib.dhp.oa.graph.raw.common.PacePerson; -import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.Author; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.Field; +import eu.dnetlib.dhp.schema.oaf.GeoLocation; +import eu.dnetlib.dhp.schema.oaf.Instance; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; public class OafToOafMapper extends AbstractMdRecordToOafMapper { @@ -39,14 +55,22 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { author.setSurname(p.getNormalisedSurname()); } - final String pid = e.attributeValue("nameIdentifier"); - final String pidType = e.attributeValue("nameIdentifierScheme"); + final String pid = e.valueOf("./@nameIdentifier"); + final String type = e.valueOf("./@nameIdentifierScheme") + .trim() + .toUpperCase() + .replaceAll(" ", "") + .replaceAll("_", ""); author.setPid(new ArrayList<>()); - if (StringUtils.isNotBlank(pid) && StringUtils.isNotBlank(pidType)) { - author - .getPid() - .add(structuredProperty(pid, qualifier(pidType, pidType, DNET_PID_TYPES, DNET_PID_TYPES), info)); + + if (StringUtils.isNotBlank(pid)) { + if (type.startsWith("ORCID")) { + final String cleanedId = pid.replaceAll("http://orcid.org/", "").replaceAll("https://orcid.org/", ""); + author.getPid().add(structuredProperty(cleanedId, ORCID_PID_TYPE, info)); + } else if (type.startsWith("MAGID")) { + author.getPid().add(structuredProperty(pid, MAG_PID_TYPE, info)); + } } res.add(author); @@ -103,38 +127,29 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { final Instance instance = new Instance(); instance - .setInstancetype( - prepareQualifier( - doc, - "//dr:CobjCategory", - DNET_PUBLICATION_RESOURCE, - DNET_PUBLICATION_RESOURCE)); + .setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); instance.setCollectedfrom(collectedfrom); instance.setHostedby(hostedby); instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); instance - .setAccessright( - prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES, DNET_ACCESS_MODES)); + .setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES, DNET_ACCESS_MODES)); instance.setLicense(field(doc.valueOf("//oaf:license"), info)); instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info)); instance - .setProcessingchargeamount( - field(doc.valueOf("//oaf:processingchargeamount"), info)); + .setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info)); instance - .setProcessingchargecurrency( - field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); + .setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); - List nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier")); + final List nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier")); instance - .setUrl( - nodes - .stream() - .filter(n -> StringUtils.isNotBlank(n.getText())) - .map(n -> n.getText().trim()) - .filter(u -> u.startsWith("http")) - .distinct() - .collect(Collectors.toCollection(ArrayList::new))); + .setUrl(nodes + .stream() + .filter(n -> StringUtils.isNotBlank(n.getText())) + .map(n -> n.getText().trim()) + .filter(u -> u.startsWith("http")) + .distinct() + .collect(Collectors.toCollection(ArrayList::new))); return Lists.newArrayList(instance); } @@ -158,19 +173,22 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { @Override protected Field prepareSoftwareCodeRepositoryUrl( - final Document doc, final DataInfo info) { + final Document doc, + final DataInfo info) { return null; // NOT PRESENT IN OAF } @Override protected List prepareSoftwareLicenses( - final Document doc, final DataInfo info) { + final Document doc, + final DataInfo info) { return new ArrayList<>(); // NOT PRESENT IN OAF } @Override protected List> prepareSoftwareDocumentationUrls( - final Document doc, final DataInfo info) { + final Document doc, + final DataInfo info) { return new ArrayList<>(); // NOT PRESENT IN OAF } @@ -182,13 +200,15 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { @Override protected Field prepareDatasetMetadataVersionNumber( - final Document doc, final DataInfo info) { + final Document doc, + final DataInfo info) { return null; // NOT PRESENT IN OAF } @Override protected Field prepareDatasetLastMetadataUpdate( - final Document doc, final DataInfo info) { + final Document doc, + final DataInfo info) { return null; // NOT PRESENT IN OAF } @@ -216,19 +236,22 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { @Override protected List> prepareOtherResearchProductTools( - final Document doc, final DataInfo info) { + final Document doc, + final DataInfo info) { return new ArrayList<>(); // NOT PRESENT IN OAF } @Override protected List> prepareOtherResearchProductContactGroups( - final Document doc, final DataInfo info) { + final Document doc, + final DataInfo info) { return new ArrayList<>(); // NOT PRESENT IN OAF } @Override protected List> prepareOtherResearchProductContactPersons( - final Document doc, final DataInfo info) { + final Document doc, + final DataInfo info) { return new ArrayList<>(); // NOT PRESENT IN OAF } @@ -251,15 +274,9 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { final String otherId = createOpenaireId(50, originalId, false); res - .add( - getRelation( - docId, otherId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info, - lastUpdateTimestamp)); + .add(getRelation(docId, otherId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info, lastUpdateTimestamp)); res - .add( - getRelation( - otherId, docId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info, - lastUpdateTimestamp)); + .add(getRelation(otherId, docId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info, lastUpdateTimestamp)); } } return res; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index d51433a182..e7edfb2487 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -3,13 +3,11 @@ package eu.dnetlib.dhp.oa.graph.raw; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty; import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES; import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_DATE; import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_RESOURCE; import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_LANGUAGES; -import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES; import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PUBLICATION_RESOURCE; import static eu.dnetlib.dhp.schema.common.ModelConstants.HAS_PARTS; import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PART_OF; @@ -45,9 +43,6 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { public static final String HTTP_DX_DOI_PREIFX = "http://dx.doi.org/"; - public static final Qualifier ORCID_PID_TYPE = qualifier("ORCID", "Open Researcher and Contributor ID", DNET_PID_TYPES, DNET_PID_TYPES); - public static final Qualifier MAG_PID_TYPE = qualifier("MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES); - public OdfToOafMapper(final Map code2name) { super(code2name); } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 6ff76e839a..631e7235ec 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -75,7 +75,7 @@ public class MappersTest { .get(); assertEquals("0000-0001-6651-1178", pid.getValue()); assertEquals("ORCID", pid.getQualifier().getClassid()); - assertEquals("ORCID", pid.getQualifier().getClassname()); + assertEquals("Open Researcher and Contributor ID", pid.getQualifier().getClassname()); assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemeid()); assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemename()); assertEquals("Votsi,Nefta", author.get().getFullname()); From 3cf2796ac6a945fe63377b3b5c062bed5af21014 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 22 May 2020 12:34:00 +0200 Subject: [PATCH 16/17] code formatting --- .../eu/dnetlib/dhp/oa/dedup/AuthorMerger.java | 259 +++++++++--------- .../dhp/oa/dedup/DedupRecordFactory.java | 8 +- .../dhp/oa/dedup/EntityMergerTest.java | 69 ++--- .../raw/AbstractMdRecordToOafMapper.java | 141 ++++++---- .../dhp/oa/graph/raw/OafToOafMapper.java | 35 ++- .../dhp/oa/graph/raw/OdfToOafMapper.java | 47 +++- 6 files changed, 315 insertions(+), 244 deletions(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AuthorMerger.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AuthorMerger.java index 108f4a4bec..43df19f8a4 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AuthorMerger.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AuthorMerger.java @@ -1,159 +1,164 @@ -package eu.dnetlib.dhp.oa.dedup; -import com.wcohen.ss.JaroWinkler; -import eu.dnetlib.dhp.schema.oaf.Author; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -import eu.dnetlib.pace.model.Person; -import org.apache.commons.lang3.StringUtils; -import scala.Tuple2; +package eu.dnetlib.dhp.oa.dedup; import java.text.Normalizer; import java.util.*; import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; + +import com.wcohen.ss.JaroWinkler; + +import eu.dnetlib.dhp.schema.oaf.Author; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.pace.model.Person; +import scala.Tuple2; + public class AuthorMerger { - private static final Double THRESHOLD = 0.95; + private static final Double THRESHOLD = 0.95; - public static List merge(List> authors){ + public static List merge(List> authors) { - authors.sort(new Comparator>() { - @Override - public int compare(List o1, List o2) { - return -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2)); - } - }); + authors.sort(new Comparator>() { + @Override + public int compare(List o1, List o2) { + return -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2)); + } + }); - List author = new ArrayList<>(); + List author = new ArrayList<>(); - for(List a : authors){ - author = mergeAuthor(author, a); - } + for (List a : authors) { + author = mergeAuthor(author, a); + } - return author; + return author; - } + } - public static List mergeAuthor(final List a, final List b) { - int pa = countAuthorsPids(a); - int pb = countAuthorsPids(b); - List base, enrich; - int sa = authorsSize(a); - int sb = authorsSize(b); + public static List mergeAuthor(final List a, final List b) { + int pa = countAuthorsPids(a); + int pb = countAuthorsPids(b); + List base, enrich; + int sa = authorsSize(a); + int sb = authorsSize(b); - if (pa == pb) { - base = sa > sb ? a : b; - enrich = sa > sb ? b : a; - } else { - base = pa > pb ? a : b; - enrich = pa > pb ? b : a; - } - enrichPidFromList(base, enrich); - return base; - } + if (pa == pb) { + base = sa > sb ? a : b; + enrich = sa > sb ? b : a; + } else { + base = pa > pb ? a : b; + enrich = pa > pb ? b : a; + } + enrichPidFromList(base, enrich); + return base; + } - private static void enrichPidFromList(List base, List enrich) { - if (base == null || enrich == null) - return; - final Map basePidAuthorMap = base - .stream() - .filter(a -> a.getPid() != null && a.getPid().size() > 0) - .flatMap( - a -> a - .getPid() - .stream() - .map(p -> new Tuple2<>(pidToComparableString(p), a))) - .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); + private static void enrichPidFromList(List base, List enrich) { + if (base == null || enrich == null) + return; + final Map basePidAuthorMap = base + .stream() + .filter(a -> a.getPid() != null && a.getPid().size() > 0) + .flatMap( + a -> a + .getPid() + .stream() + .map(p -> new Tuple2<>(pidToComparableString(p), a))) + .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); - final List> pidToEnrich = enrich - .stream() - .filter(a -> a.getPid() != null && a.getPid().size() > 0) - .flatMap( - a -> a - .getPid() - .stream() - .filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p))) - .map(p -> new Tuple2<>(p, a))) - .collect(Collectors.toList()); + final List> pidToEnrich = enrich + .stream() + .filter(a -> a.getPid() != null && a.getPid().size() > 0) + .flatMap( + a -> a + .getPid() + .stream() + .filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p))) + .map(p -> new Tuple2<>(p, a))) + .collect(Collectors.toList()); - pidToEnrich - .forEach( - a -> { - Optional> simAuthor = base - .stream() - .map(ba -> new Tuple2<>(sim(ba, a._2()), ba)) - .max(Comparator.comparing(Tuple2::_1)); - if (simAuthor.isPresent() && simAuthor.get()._1() > THRESHOLD) { - Author r = simAuthor.get()._2(); - if (r.getPid() == null) { - r.setPid(new ArrayList<>()); - } - r.getPid().add(a._1()); - } - }); - } + pidToEnrich + .forEach( + a -> { + Optional> simAuthor = base + .stream() + .map(ba -> new Tuple2<>(sim(ba, a._2()), ba)) + .max(Comparator.comparing(Tuple2::_1)); + if (simAuthor.isPresent() && simAuthor.get()._1() > THRESHOLD) { + Author r = simAuthor.get()._2(); + if (r.getPid() == null) { + r.setPid(new ArrayList<>()); + } + r.getPid().add(a._1()); + } + }); + } - public static String pidToComparableString(StructuredProperty pid){ - return (pid.getQualifier()!=null? pid.getQualifier().getClassid()!=null?pid.getQualifier().getClassid().toLowerCase():"":"") + (pid.getValue()!=null? pid.getValue().toLowerCase():""); - } + public static String pidToComparableString(StructuredProperty pid) { + return (pid.getQualifier() != null + ? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : "" + : "") + (pid.getValue() != null ? pid.getValue().toLowerCase() : ""); + } - public static int countAuthorsPids(List authors) { - if (authors == null) - return 0; + public static int countAuthorsPids(List authors) { + if (authors == null) + return 0; - return (int) authors.stream().filter(AuthorMerger::hasPid).count(); - } + return (int) authors.stream().filter(AuthorMerger::hasPid).count(); + } - private static int authorsSize(List authors) { - if (authors == null) - return 0; - return authors.size(); - } + private static int authorsSize(List authors) { + if (authors == null) + return 0; + return authors.size(); + } - private static Double sim(Author a, Author b) { + private static Double sim(Author a, Author b) { - final Person pa = parse(a); - final Person pb = parse(b); + final Person pa = parse(a); + final Person pb = parse(b); - if (pa.isAccurate() & pb.isAccurate()) { - return new JaroWinkler() - .score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())); - } else { - return new JaroWinkler() - .score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname())); - } - } + if (pa.isAccurate() & pb.isAccurate()) { + return new JaroWinkler() + .score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())); + } else { + return new JaroWinkler() + .score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname())); + } + } - private static boolean hasPid(Author a) { - if (a == null || a.getPid() == null || a.getPid().size() == 0) - return false; - return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue())); - } + private static boolean hasPid(Author a) { + if (a == null || a.getPid() == null || a.getPid().size() == 0) + return false; + return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue())); + } - private static Person parse(Author author) { - if (StringUtils.isNotBlank(author.getSurname())) { - return new Person(author.getSurname() + ", " + author.getName(), false); - } else { - return new Person(author.getFullname(), false); - } - } + private static Person parse(Author author) { + if (StringUtils.isNotBlank(author.getSurname())) { + return new Person(author.getSurname() + ", " + author.getName(), false); + } else { + return new Person(author.getFullname(), false); + } + } - private static String normalize(final String s) { - return nfd(s) - .toLowerCase() - // do not compact the regexes in a single expression, would cause StackOverflowError - // in case - // of large input strings - .replaceAll("(\\W)+", " ") - .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") - .replaceAll("(\\p{Punct})+", " ") - .replaceAll("(\\d)+", " ") - .replaceAll("(\\n)+", " ") - .trim(); - } + private static String normalize(final String s) { + return nfd(s) + .toLowerCase() + // do not compact the regexes in a single expression, would cause StackOverflowError + // in case + // of large input strings + .replaceAll("(\\W)+", " ") + .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") + .replaceAll("(\\p{Punct})+", " ") + .replaceAll("(\\d)+", " ") + .replaceAll("(\\n)+", " ") + .trim(); + } - private static String nfd(final String s) { - return Normalizer.normalize(s, Normalizer.Form.NFD); - } + private static String nfd(final String s) { + return Normalizer.normalize(s, Normalizer.Form.NFD); + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java index eed783e536..8028d5a94f 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.oa.dedup; import java.io.Serializable; @@ -73,7 +74,8 @@ public class DedupRecordFactory { } public static T entityMerger( - String id, Iterator> entities, long ts, DataInfo dataInfo, Class clazz) throws IllegalAccessException, InstantiationException { + String id, Iterator> entities, long ts, DataInfo dataInfo, Class clazz) + throws IllegalAccessException, InstantiationException { T entity = clazz.newInstance(); @@ -87,14 +89,14 @@ public class DedupRecordFactory { entity.mergeFrom(duplicate); if (ModelSupport.isSubClass(duplicate, Result.class)) { Result r1 = (Result) duplicate; - if (r1.getAuthor() != null && r1.getAuthor().size()>0) + if (r1.getAuthor() != null && r1.getAuthor().size() > 0) authors.add(r1.getAuthor()); if (r1.getDateofacceptance() != null) dates.add(r1.getDateofacceptance().getValue()); } }); - //set authors and date + // set authors and date if (ModelSupport.isSubClass(entity, Result.class)) { ((Result) entity).setDateofacceptance(DatePicker.pick(dates)); ((Result) entity).setAuthor(AuthorMerger.merge(authors)); diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java index 526dd73bb0..0a3bf62ea8 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java @@ -1,5 +1,8 @@ + package eu.dnetlib.dhp.oa.dedup; +import static org.junit.jupiter.api.Assertions.assertEquals; + import java.io.BufferedReader; import java.io.FileReader; import java.io.IOException; @@ -7,15 +10,13 @@ import java.io.Serializable; import java.nio.file.Paths; import java.util.*; -import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.pace.util.MapDocumentUtil; import org.codehaus.jackson.map.ObjectMapper; import org.junit.jupiter.api.BeforeEach; - import org.junit.jupiter.api.Test; -import scala.Tuple2; -import static org.junit.jupiter.api.Assertions.assertEquals; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.pace.util.MapDocumentUtil; +import scala.Tuple2; public class EntityMergerTest implements Serializable { @@ -30,9 +31,9 @@ public class EntityMergerTest implements Serializable { public void setUp() throws Exception { testEntityBasePath = Paths - .get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/json").toURI()) - .toFile() - .getAbsolutePath(); + .get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/json").toURI()) + .toFile() + .getAbsolutePath(); publications = readSample(testEntityBasePath + "/publication_merge.json", Publication.class); @@ -45,7 +46,8 @@ public class EntityMergerTest implements Serializable { @Test public void publicationMergerTest() throws InstantiationException, IllegalAccessException { - Publication pub_merged = DedupRecordFactory.entityMerger(dedupId, publications.iterator(), 0, dataInfo, Publication.class); + Publication pub_merged = DedupRecordFactory + .entityMerger(dedupId, publications.iterator(), 0, dataInfo, Publication.class); assertEquals(dedupId, pub_merged.getId()); @@ -59,36 +61,36 @@ public class EntityMergerTest implements Serializable { assertEquals(pub_merged.getDateoftransformation(), pub_top.getDateoftransformation()); assertEquals(pub_merged.getOaiprovenance(), pub_top.getOaiprovenance()); assertEquals(pub_merged.getDateofcollection(), pub_top.getDateofcollection()); - assertEquals(pub_merged.getInstance().size(),3); + assertEquals(pub_merged.getInstance().size(), 3); assertEquals(pub_merged.getCountry().size(), 2); assertEquals(pub_merged.getSubject().size(), 0); assertEquals(pub_merged.getTitle().size(), 2); - assertEquals(pub_merged.getRelevantdate().size(),0); - assertEquals(pub_merged.getDescription().size(),0); - assertEquals(pub_merged.getSource().size(),0); - assertEquals(pub_merged.getFulltext().size(),0); - assertEquals(pub_merged.getFormat().size(),0); - assertEquals(pub_merged.getContributor().size(),0); - assertEquals(pub_merged.getCoverage().size(),0); - assertEquals(pub_merged.getContext().size(),0); - assertEquals(pub_merged.getExternalReference().size(),0); - assertEquals(pub_merged.getOriginalId().size(),3); - assertEquals(pub_merged.getCollectedfrom().size(),3); - assertEquals(pub_merged.getPid().size(),1); - assertEquals(pub_merged.getExtraInfo().size(),0); + assertEquals(pub_merged.getRelevantdate().size(), 0); + assertEquals(pub_merged.getDescription().size(), 0); + assertEquals(pub_merged.getSource().size(), 0); + assertEquals(pub_merged.getFulltext().size(), 0); + assertEquals(pub_merged.getFormat().size(), 0); + assertEquals(pub_merged.getContributor().size(), 0); + assertEquals(pub_merged.getCoverage().size(), 0); + assertEquals(pub_merged.getContext().size(), 0); + assertEquals(pub_merged.getExternalReference().size(), 0); + assertEquals(pub_merged.getOriginalId().size(), 3); + assertEquals(pub_merged.getCollectedfrom().size(), 3); + assertEquals(pub_merged.getPid().size(), 1); + assertEquals(pub_merged.getExtraInfo().size(), 0); - //verify datainfo + // verify datainfo assertEquals(pub_merged.getDataInfo(), dataInfo); - //verify datepicker + // verify datepicker assertEquals(pub_merged.getDateofacceptance().getValue(), "2018-09-30"); - //verify authors + // verify authors assertEquals(pub_merged.getAuthor().size(), 9); assertEquals(AuthorMerger.countAuthorsPids(pub_merged.getAuthor()), 4); } - public DataInfo setDI(){ + public DataInfo setDI() { DataInfo dataInfo = new DataInfo(); dataInfo.setTrust("0.9"); dataInfo.setDeletedbyinference(false); @@ -97,13 +99,13 @@ public class EntityMergerTest implements Serializable { return dataInfo; } - public Publication getTopPub(List> publications){ + public Publication getTopPub(List> publications) { Double maxTrust = 0.0; Publication maxPub = new Publication(); for (Tuple2 publication : publications) { Double pubTrust = Double.parseDouble(publication._2().getDataInfo().getTrust()); - if(pubTrust > maxTrust){ + if (pubTrust > maxTrust) { maxTrust = pubTrust; maxPub = publication._2(); } @@ -118,11 +120,11 @@ public class EntityMergerTest implements Serializable { reader = new BufferedReader(new FileReader(path)); String line = reader.readLine(); while (line != null) { - res.add( + res + .add( new Tuple2<>( - MapDocumentUtil.getJPathString("$.id", line), - new ObjectMapper().readValue(line, clazz)) - ); + MapDocumentUtil.getJPathString("$.id", line), + new ObjectMapper().readValue(line, clazz))); // read next line line = reader.readLine(); } @@ -134,5 +136,4 @@ public class EntityMergerTest implements Serializable { return res; } - } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 84b200c076..c4639eb44a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -59,8 +59,10 @@ public abstract class AbstractMdRecordToOafMapper { protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4"; protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3"; - protected static final Qualifier ORCID_PID_TYPE = qualifier("ORCID", "Open Researcher and Contributor ID", DNET_PID_TYPES, DNET_PID_TYPES); - protected static final Qualifier MAG_PID_TYPE = qualifier("MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES); + protected static final Qualifier ORCID_PID_TYPE = qualifier( + "ORCID", "Open Researcher and Contributor ID", DNET_PID_TYPES, DNET_PID_TYPES); + protected static final Qualifier MAG_PID_TYPE = qualifier( + "MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES); protected static final Map nsContext = new HashMap<>(); @@ -74,7 +76,8 @@ public abstract class AbstractMdRecordToOafMapper { nsContext.put("datacite", DATACITE_SCHEMA_KERNEL_3); } - protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); + protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier( + "main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); protected AbstractMdRecordToOafMapper(final Map code2name) { this.code2name = code2name; @@ -88,15 +91,20 @@ public abstract class AbstractMdRecordToOafMapper { .parseText(xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3)); final String type = doc.valueOf("//dr:CobjCategory/@type"); - final KeyValue collectedFrom = getProvenanceDatasource(doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); + final KeyValue collectedFrom = getProvenanceDatasource( + doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); - if (collectedFrom == null) { return null; } + if (collectedFrom == null) { + return null; + } final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) ? collectedFrom : getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name"); - if (hostedBy == null) { return null; } + if (hostedBy == null) { + return null; + } final DataInfo info = prepareDataInfo(doc); final long lastUpdateTimestamp = new Date().getTime(); @@ -111,7 +119,9 @@ public abstract class AbstractMdRecordToOafMapper { final String dsId = doc.valueOf(xpathId); final String dsName = doc.valueOf(xpathName); - if (StringUtils.isBlank(dsId) | StringUtils.isBlank(dsName)) { return null; } + if (StringUtils.isBlank(dsId) | StringUtils.isBlank(dsName)) { + return null; + } return keyValue(createOpenaireId(10, dsId, true), dsName); } @@ -127,47 +137,47 @@ public abstract class AbstractMdRecordToOafMapper { final List oafs = new ArrayList<>(); switch (type.toLowerCase()) { - case "publication": - final Publication p = new Publication(); - populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - p.setResulttype(PUBLICATION_DEFAULT_RESULTTYPE); - p.setJournal(prepareJournal(doc, info)); - oafs.add(p); - break; - case "dataset": - final Dataset d = new Dataset(); - populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - d.setResulttype(DATASET_DEFAULT_RESULTTYPE); - d.setStoragedate(prepareDatasetStorageDate(doc, info)); - d.setDevice(prepareDatasetDevice(doc, info)); - d.setSize(prepareDatasetSize(doc, info)); - d.setVersion(prepareDatasetVersion(doc, info)); - d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info)); - d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info)); - d.setGeolocation(prepareDatasetGeoLocations(doc, info)); - oafs.add(d); - break; - case "software": - final Software s = new Software(); - populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - s.setResulttype(SOFTWARE_DEFAULT_RESULTTYPE); - s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info)); - s.setLicense(prepareSoftwareLicenses(doc, info)); - s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info)); - s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info)); - oafs.add(s); - break; - case "": - case "otherresearchproducts": - default: - final OtherResearchProduct o = new OtherResearchProduct(); - populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - o.setResulttype(ORP_DEFAULT_RESULTTYPE); - o.setContactperson(prepareOtherResearchProductContactPersons(doc, info)); - o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info)); - o.setTool(prepareOtherResearchProductTools(doc, info)); - oafs.add(o); - break; + case "publication": + final Publication p = new Publication(); + populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + p.setResulttype(PUBLICATION_DEFAULT_RESULTTYPE); + p.setJournal(prepareJournal(doc, info)); + oafs.add(p); + break; + case "dataset": + final Dataset d = new Dataset(); + populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + d.setResulttype(DATASET_DEFAULT_RESULTTYPE); + d.setStoragedate(prepareDatasetStorageDate(doc, info)); + d.setDevice(prepareDatasetDevice(doc, info)); + d.setSize(prepareDatasetSize(doc, info)); + d.setVersion(prepareDatasetVersion(doc, info)); + d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info)); + d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info)); + d.setGeolocation(prepareDatasetGeoLocations(doc, info)); + oafs.add(d); + break; + case "software": + final Software s = new Software(); + populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + s.setResulttype(SOFTWARE_DEFAULT_RESULTTYPE); + s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info)); + s.setLicense(prepareSoftwareLicenses(doc, info)); + s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info)); + s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info)); + oafs.add(s); + break; + case "": + case "otherresearchproducts": + default: + final OtherResearchProduct o = new OtherResearchProduct(); + populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + o.setResulttype(ORP_DEFAULT_RESULTTYPE); + o.setContactperson(prepareOtherResearchProductContactPersons(doc, info)); + o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info)); + o.setTool(prepareOtherResearchProductTools(doc, info)); + oafs.add(o); + break; } if (!oafs.isEmpty()) { @@ -196,9 +206,15 @@ public abstract class AbstractMdRecordToOafMapper { final String projectId = createOpenaireId(40, originalId, true); res - .add(getRelation(docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, collectedFrom, info, lastUpdateTimestamp)); + .add( + getRelation( + docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, collectedFrom, info, + lastUpdateTimestamp)); res - .add(getRelation(projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, collectedFrom, info, lastUpdateTimestamp)); + .add( + getRelation( + projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, collectedFrom, info, + lastUpdateTimestamp)); } } @@ -244,7 +260,9 @@ public abstract class AbstractMdRecordToOafMapper { r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier"))); r.setCollectedfrom(Arrays.asList(collectedFrom)); r - .setPid(prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); + .setPid( + prepareListStructProps( + doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); r.setDateofcollection(doc.valueOf("//dr:dateOfCollection")); r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation")); r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES @@ -362,7 +380,9 @@ public abstract class AbstractMdRecordToOafMapper { final String sp = n.valueOf("@sp"); final String vol = n.valueOf("@vol"); final String edition = n.valueOf("@edition"); - if (StringUtils.isNotBlank(name)) { return journal(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, null, null, info); } + if (StringUtils.isNotBlank(name)) { + return journal(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, null, null, info); + } } return null; } @@ -415,7 +435,10 @@ public abstract class AbstractMdRecordToOafMapper { for (final Object o : node.selectNodes(xpath)) { final Node n = (Node) o; res - .add(structuredProperty(n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), n.valueOf("@schemename"), info)); + .add( + structuredProperty( + n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), + n.valueOf("@schemename"), info)); } return res; } @@ -423,7 +446,9 @@ public abstract class AbstractMdRecordToOafMapper { protected OAIProvenance prepareOAIprovenance(final Document doc) { final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']"); - if (n == null) { return null; } + if (n == null) { + return null; + } final String identifier = n.valueOf("./*[local-name()='identifier']"); final String baseURL = n.valueOf("./*[local-name()='baseURL']"); @@ -438,7 +463,9 @@ public abstract class AbstractMdRecordToOafMapper { protected DataInfo prepareDataInfo(final Document doc) { final Node n = doc.selectSingleNode("//oaf:datainfo"); - if (n == null) { return dataInfo(false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9"); } + if (n == null) { + return dataInfo(false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9"); + } final String paClassId = n.valueOf("./oaf:provenanceaction/@classid"); final String paClassName = n.valueOf("./oaf:provenanceaction/@classname"); @@ -450,7 +477,9 @@ public abstract class AbstractMdRecordToOafMapper { final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred")); final String trust = n.valueOf("./oaf:trust"); - return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust); + return dataInfo( + deletedbyinference, inferenceprovenance, inferred, false, + qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust); } protected Field prepareField(final Node node, final String xpath, final DataInfo info) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index 24a8b45270..af9fe7197e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -56,7 +56,8 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { } final String pid = e.valueOf("./@nameIdentifier"); - final String type = e.valueOf("./@nameIdentifierScheme") + final String type = e + .valueOf("./@nameIdentifierScheme") .trim() .toUpperCase() .replaceAll(" ", "") @@ -66,7 +67,9 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { if (StringUtils.isNotBlank(pid)) { if (type.startsWith("ORCID")) { - final String cleanedId = pid.replaceAll("http://orcid.org/", "").replaceAll("https://orcid.org/", ""); + final String cleanedId = pid + .replaceAll("http://orcid.org/", "") + .replaceAll("https://orcid.org/", ""); author.getPid().add(structuredProperty(cleanedId, ORCID_PID_TYPE, info)); } else if (type.startsWith("MAGID")) { author.getPid().add(structuredProperty(pid, MAG_PID_TYPE, info)); @@ -127,7 +130,8 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { final Instance instance = new Instance(); instance - .setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); + .setInstancetype( + prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); instance.setCollectedfrom(collectedfrom); instance.setHostedby(hostedby); instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); @@ -143,13 +147,14 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { final List nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier")); instance - .setUrl(nodes - .stream() - .filter(n -> StringUtils.isNotBlank(n.getText())) - .map(n -> n.getText().trim()) - .filter(u -> u.startsWith("http")) - .distinct() - .collect(Collectors.toCollection(ArrayList::new))); + .setUrl( + nodes + .stream() + .filter(n -> StringUtils.isNotBlank(n.getText())) + .map(n -> n.getText().trim()) + .filter(u -> u.startsWith("http")) + .distinct() + .collect(Collectors.toCollection(ArrayList::new))); return Lists.newArrayList(instance); } @@ -274,9 +279,15 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { final String otherId = createOpenaireId(50, originalId, false); res - .add(getRelation(docId, otherId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info, lastUpdateTimestamp)); + .add( + getRelation( + docId, otherId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info, + lastUpdateTimestamp)); res - .add(getRelation(otherId, docId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info, lastUpdateTimestamp)); + .add( + getRelation( + otherId, docId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info, + lastUpdateTimestamp)); } } return res; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index e7edfb2487..9c74c4a938 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -94,7 +94,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { for (final Object o : n.selectNodes("./datacite:nameIdentifier")) { final String id = ((Node) o).getText(); - final String type = ((Node) o).valueOf("./@nameIdentifierScheme") + final String type = ((Node) o) + .valueOf("./@nameIdentifierScheme") .trim() .toUpperCase() .replaceAll(" ", "") @@ -119,7 +120,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { final Instance instance = new Instance(); instance - .setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); + .setInstancetype( + prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); instance.setCollectedfrom(collectedfrom); instance.setHostedby(hostedby); instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); @@ -168,7 +170,10 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { && !dateType.equalsIgnoreCase("Updated") && !dateType.equalsIgnoreCase("Available")) { res - .add(structuredProperty(((Node) o).getText(), "UNKNOWN", "UNKNOWN", DNET_DATA_CITE_DATE, DNET_DATA_CITE_DATE, info)); + .add( + structuredProperty( + ((Node) o).getText(), "UNKNOWN", "UNKNOWN", DNET_DATA_CITE_DATE, DNET_DATA_CITE_DATE, + info)); } } return res; @@ -220,14 +225,16 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { protected List> prepareOtherResearchProductContactGroups( final Document doc, final DataInfo info) { - return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", info); + return prepareListFields( + doc, "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", info); } @Override protected List> prepareOtherResearchProductContactPersons( final Document doc, final DataInfo info) { - return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", info); + return prepareListFields( + doc, "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", info); } @Override @@ -253,7 +260,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { protected List> prepareSoftwareDocumentationUrls( final Document doc, final DataInfo info) { - return prepareListFields(doc, "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info); + return prepareListFields( + doc, "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info); } // DATASETS @@ -327,16 +335,29 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { if (type.equalsIgnoreCase("IsSupplementTo")) { res - .add(getRelation(docId, otherId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, collectedFrom, info, lastUpdateTimestamp)); + .add( + getRelation( + docId, otherId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, collectedFrom, info, + lastUpdateTimestamp)); res - .add(getRelation(otherId, docId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENTED_BY, collectedFrom, info, lastUpdateTimestamp)); + .add( + getRelation( + otherId, docId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENTED_BY, collectedFrom, info, + lastUpdateTimestamp)); } else if (type.equals("IsPartOf")) { res - .add(getRelation(docId, otherId, RESULT_RESULT, PART, IS_PART_OF, collectedFrom, info, lastUpdateTimestamp)); + .add( + getRelation( + docId, otherId, RESULT_RESULT, PART, IS_PART_OF, collectedFrom, info, + lastUpdateTimestamp)); res - .add(getRelation(otherId, docId, RESULT_RESULT, PART, HAS_PARTS, collectedFrom, info, lastUpdateTimestamp)); - } else {} + .add( + getRelation( + otherId, docId, RESULT_RESULT, PART, HAS_PARTS, collectedFrom, info, + lastUpdateTimestamp)); + } else { + } } } return res; @@ -344,6 +365,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { @Override protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { - return prepareQualifier(doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", DNET_DATA_CITE_RESOURCE, DNET_DATA_CITE_RESOURCE); + return prepareQualifier( + doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", DNET_DATA_CITE_RESOURCE, + DNET_DATA_CITE_RESOURCE); } } From 4308f311650bca96eab1ad5d3e09c87f56d97901 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 22 May 2020 13:13:01 +0200 Subject: [PATCH 17/17] added fix to make test run --- .../ResultToOrganizationJobTest.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/ResultToOrganizationJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/ResultToOrganizationJobTest.java index 435b766052..cfcccc5f0d 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/ResultToOrganizationJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/ResultToOrganizationJobTest.java @@ -99,6 +99,7 @@ public class ResultToOrganizationJobTest { .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); Assertions.assertEquals(0, tmp.count()); + FileUtils.deleteDirectory(workingDir.toFile()); } /** @@ -171,6 +172,7 @@ public class ResultToOrganizationJobTest { + "(target = '20|opendoar____::124266ebc4ece2934eb80edfda3f2091' " + "or target = '20|dedup_wf_001::5168917a6aeeea55269daeac1af2ecd2')") .count()); + FileUtils.deleteDirectory(workingDir.toFile()); } @Test @@ -266,5 +268,6 @@ public class ResultToOrganizationJobTest { "relclass = 'isAuthorInstitutionOf' and " + "substring(source, 1,2) = '20' and substring(target, 1, 2) = '50'") .count()); + FileUtils.deleteDirectory(workingDir.toFile()); } }