diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Topic.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Topic.java index 29f6cbe3a..0716bd98d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Topic.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Topic.java @@ -20,7 +20,7 @@ public enum Topic { // ENRICHMENT MORE ENRICH_MORE_PID("ENRICH/MORE/PID"), ENRICH_MORE_OA_VERSION("ENRICH/MORE/OPENACCESS_VERSION"), ENRICH_MORE_ABSTRACT( "ENRICH/MORE/ABSTRACT"), ENRICH_MORE_PUBLICATION_DATE("ENRICH/MORE/PUBLICATION_DATE"), ENRICH_MORE_PROJECT( - "ENRICH/MORE/PROJECT"), ENRICH_MORE_SUBJECT_MESHEUROPMC( + "ENRICH/MORE/PROJECT"), ENRICH_MORE_SOFTWARE("ENRICH/MORE/SOFTWARE"), ENRICH_MORE_SUBJECT_MESHEUROPMC( "ENRICH/MORE/SUBJECT/MESHEUROPMC"), ENRICH_MORE_SUBJECT_ARXIV( "ENRICH/MORE/SUBJECT/ARXIV"), ENRICH_MORE_SUBJECT_JEL( "ENRICH/MORE/SUBJECT/JEL"), ENRICH_MORE_SUBJECT_DDC( @@ -28,7 +28,21 @@ public enum Topic { "ENRICH/MORE/SUBJECT/ACM"), ENRICH_MORE_SUBJECT_RVK("ENRICH/MORE/SUBJECT/RVK"), // ADDITION - ADD_BY_PROJECT("ADD/BY_PROJECT"); + ADD_BY_PROJECT("ADD/BY_PROJECT"), + + // OTHER RELS + ENRICH_MISSING_PUBLICATION_IS_RELATED_TO( + "ENRICH/MISSING/PUBLICATION/IS_RELATED_TO"), ENRICH_MISSING_PUBLICATION_REFERENCES( + "ENRICH/MISSING/PUBLICATION/REFERENCES"), ENRICH_MISSING_PUBLICATION_IS_REFERENCED_BY( + "ENRICH/MISSING/PUBLICATION/IS_REFERENCED_BY"), ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_TO( + "ENRICH/MISSING/PUBLICATION/IS_SUPPLEMENTED_TO"), ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_BY( + "ENRICH/MISSING/PUBLICATION/IS_SUPPLEMENTED_BY"), + + ENRICH_MISSING_DATASET_IS_RELATED_TO("ENRICH/MISSING/DATASET/IS_RELATED_TO"), ENRICH_MISSING_DATASET_REFERENCES( + "ENRICH/MISSING/DATASET/REFERENCES"), ENRICH_MISSING_DATASET_IS_REFERENCED_BY( + "ENRICH/MISSING/DATASET/IS_REFERENCED_BY"), ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_TO( + "ENRICH/MISSING/DATASET/IS_SUPPLEMENTED_TO"), ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_BY( + "ENRICH/MISSING/DATASET/IS_SUPPLEMENTED_BY"),; Topic(final String path) { this.path = path; diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java index 43ebd6dd8..fa425a181 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java @@ -4,48 +4,101 @@ package eu.dnetlib.dhp.broker.oa; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.util.ArrayList; +import java.util.Collection; import java.util.List; +import java.util.Map; import java.util.Optional; import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Column; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; + import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.broker.model.Event; import eu.dnetlib.dhp.broker.model.EventFactory; import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingAbstract; import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingAuthorOrcid; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingDatasetIsReferencedBy; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingDatasetIsRelatedTo; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingDatasetIsSupplementedBy; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingDatasetIsSupplementedTo; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingDatasetReferences; import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingOpenAccess; import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPid; import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingProject; import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPublicationDate; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPublicationIsReferencedBy; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPublicationIsRelatedTo; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPublicationIsSupplementedBy; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPublicationIsSupplementedTo; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPublicationReferences; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingSoftware; import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingSubject; import eu.dnetlib.dhp.broker.oa.matchers.EnrichMoreOpenAccess; import eu.dnetlib.dhp.broker.oa.matchers.EnrichMorePid; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMoreProject; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMoreSoftware; import eu.dnetlib.dhp.broker.oa.matchers.EnrichMoreSubject; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; +import eu.dnetlib.dhp.schema.oaf.Project; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.Software; public class GenerateEventsApplication { private static final Logger log = LoggerFactory.getLogger(GenerateEventsApplication.class); - private static final UpdateMatcher enrichMissingAbstract = new EnrichMissingAbstract(); - private static final UpdateMatcher enrichMissingAuthorOrcid = new EnrichMissingAuthorOrcid(); - private static final UpdateMatcher enrichMissingOpenAccess = new EnrichMissingOpenAccess(); - private static final UpdateMatcher enrichMissingPid = new EnrichMissingPid(); - private static final UpdateMatcher enrichMissingProject = new EnrichMissingProject(); - private static final UpdateMatcher enrichMissingPublicationDate = new EnrichMissingPublicationDate(); - private static final UpdateMatcher enrichMissingSubject = new EnrichMissingSubject(); - private static final UpdateMatcher enrichMoreOpenAccess = new EnrichMoreOpenAccess(); - private static final UpdateMatcher enrichMorePid = new EnrichMorePid(); - private static final UpdateMatcher enrichMoreSubject = new EnrichMoreSubject(); + // Simple Matchers + private static final UpdateMatcher enrichMissingAbstract = new EnrichMissingAbstract(); + private static final UpdateMatcher enrichMissingAuthorOrcid = new EnrichMissingAuthorOrcid(); + private static final UpdateMatcher enrichMissingOpenAccess = new EnrichMissingOpenAccess(); + private static final UpdateMatcher enrichMissingPid = new EnrichMissingPid(); + private static final UpdateMatcher enrichMissingPublicationDate = new EnrichMissingPublicationDate(); + private static final UpdateMatcher enrichMissingSubject = new EnrichMissingSubject(); + private static final UpdateMatcher enrichMoreOpenAccess = new EnrichMoreOpenAccess(); + private static final UpdateMatcher enrichMorePid = new EnrichMorePid(); + private static final UpdateMatcher enrichMoreSubject = new EnrichMoreSubject(); + + // Advanced matchers + private static final UpdateMatcher>, ?> enrichMissingProject = new EnrichMissingProject(); + private static final UpdateMatcher>, ?> enrichMoreProject = new EnrichMoreProject(); + + private static final UpdateMatcher>, ?> enrichMissingSoftware = new EnrichMissingSoftware(); + private static final UpdateMatcher>, ?> enrichMoreSoftware = new EnrichMoreSoftware(); + + private static final UpdateMatcher>, ?> enrichMisissingPublicationIsRelatedTo = new EnrichMissingPublicationIsRelatedTo(); + private static final UpdateMatcher>, ?> enrichMissingPublicationIsReferencedBy = new EnrichMissingPublicationIsReferencedBy(); + private static final UpdateMatcher>, ?> enrichMissingPublicationReferences = new EnrichMissingPublicationReferences(); + private static final UpdateMatcher>, ?> enrichMissingPublicationIsSupplementedTo = new EnrichMissingPublicationIsSupplementedTo(); + private static final UpdateMatcher>, ?> enrichMissingPublicationIsSupplementedBy = new EnrichMissingPublicationIsSupplementedBy(); + + private static final UpdateMatcher>, ?> enrichMisissingDatasetIsRelatedTo = new EnrichMissingDatasetIsRelatedTo(); + private static final UpdateMatcher>, ?> enrichMissingDatasetIsReferencedBy = new EnrichMissingDatasetIsReferencedBy(); + private static final UpdateMatcher>, ?> enrichMissingDatasetReferences = new EnrichMissingDatasetReferences(); + private static final UpdateMatcher>, ?> enrichMissingDatasetIsSupplementedTo = new EnrichMissingDatasetIsSupplementedTo(); + private static final UpdateMatcher>, ?> enrichMissingDatasetIsSupplementedBy = new EnrichMissingDatasetIsSupplementedBy(); + + public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( @@ -68,9 +121,19 @@ public class GenerateEventsApplication { log.info("eventsPath: {}", eventsPath); final SparkConf conf = new SparkConf(); + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); removeOutputDir(spark, eventsPath); - generateEvents(spark, graphPath, eventsPath); + + final JavaRDD eventsRdd = sc.emptyRDD(); + + eventsRdd.union(generateSimpleEvents(spark, graphPath, Publication.class)); + eventsRdd.union(generateSimpleEvents(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class)); + eventsRdd.union(generateSimpleEvents(spark, graphPath, Software.class)); + eventsRdd.union(generateSimpleEvents(spark, graphPath, OtherResearchProduct.class)); + + eventsRdd.saveAsTextFile(eventsPath, GzipCodec.class); }); } @@ -79,11 +142,34 @@ public class GenerateEventsApplication { HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); } - private static void generateEvents(final SparkSession spark, final String graphPath, final String eventsPath) { - // TODO + private static JavaRDD generateSimpleEvents(final SparkSession spark, + final String graphPath, + final Class resultClazz) { + + final Dataset results = readPath( + spark, graphPath + "/" + resultClazz.getSimpleName().toLowerCase(), resultClazz) + .filter(r -> r.getDataInfo().getDeletedbyinference()); + + final Dataset rels = readPath(spark, graphPath + "/relation", Relation.class) + .filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)); + + final Column c = null; // TODO + + final Dataset aa = results + .joinWith(rels, results.col("id").equalTo(rels.col("source")), "inner") + .groupBy(rels.col("target")) + .agg(c) + .filter(x -> x.size() > 1) + // generateSimpleEvents(...) + // flatMap() + // toRdd() + ; + + return null; + } - private List generateEvents(final Result... children) { + private List generateSimpleEvents(final Collection children) { final List> list = new ArrayList<>(); for (final Result target : children) { @@ -91,7 +177,6 @@ public class GenerateEventsApplication { list.addAll(enrichMissingAuthorOrcid.searchUpdatesForRecord(target, children)); list.addAll(enrichMissingOpenAccess.searchUpdatesForRecord(target, children)); list.addAll(enrichMissingPid.searchUpdatesForRecord(target, children)); - list.addAll(enrichMissingProject.searchUpdatesForRecord(target, children)); list.addAll(enrichMissingPublicationDate.searchUpdatesForRecord(target, children)); list.addAll(enrichMissingSubject.searchUpdatesForRecord(target, children)); list.addAll(enrichMoreOpenAccess.searchUpdatesForRecord(target, children)); @@ -102,4 +187,94 @@ public class GenerateEventsApplication { return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList()); } + private List generateProjectsEvents(final Collection>> childrenWithProjects) { + final List> list = new ArrayList<>(); + + for (final Pair> target : childrenWithProjects) { + list.addAll(enrichMissingProject.searchUpdatesForRecord(target, childrenWithProjects)); + list.addAll(enrichMoreProject.searchUpdatesForRecord(target, childrenWithProjects)); + } + + return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList()); + } + + private List generateSoftwareEvents(final Collection>> childrenWithSoftwares) { + final List> list = new ArrayList<>(); + + for (final Pair> target : childrenWithSoftwares) { + list.addAll(enrichMissingSoftware.searchUpdatesForRecord(target, childrenWithSoftwares)); + list.addAll(enrichMoreSoftware.searchUpdatesForRecord(target, childrenWithSoftwares)); + } + return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList()); + } + + private List generatePublicationRelatedEvents(final String relType, + final Collection>>> childrenWithRels) { + + final List> list = new ArrayList<>(); + + final List>> cleanedChildrens = childrenWithRels + .stream() + .filter(p -> p.getRight().containsKey(relType)) + .map(p -> Pair.of(p.getLeft(), p.getRight().get(relType))) + .filter(p -> p.getRight().size() > 0) + .collect(Collectors.toList()); + + for (final Pair> target : cleanedChildrens) { + if (relType.equals("isRelatedTo")) { + list.addAll(enrichMisissingPublicationIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens)); + } else if (relType.equals("references")) { + list.addAll(enrichMissingPublicationReferences.searchUpdatesForRecord(target, cleanedChildrens)); + } else if (relType.equals("isReferencedBy")) { + list.addAll(enrichMissingPublicationIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens)); + } else if (relType.equals("isSupplementedTo")) { + list.addAll(enrichMissingPublicationIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens)); + } else if (relType.equals("isSupplementedBy")) { + list.addAll(enrichMissingPublicationIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens)); + } + } + + return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList()); + + } + + private List generateDatasetRelatedEvents(final String relType, + final Collection>>> childrenWithRels) { + + final List> list = new ArrayList<>(); + + final List>> cleanedChildrens = childrenWithRels + .stream() + .filter(p -> p.getRight().containsKey(relType)) + .map(p -> Pair.of(p.getLeft(), p.getRight().get(relType))) + .filter(p -> p.getRight().size() > 0) + .collect(Collectors.toList()); + + for (final Pair> target : cleanedChildrens) { + if (relType.equals("isRelatedTo")) { + list.addAll(enrichMisissingDatasetIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens)); + } else if (relType.equals("references")) { + list.addAll(enrichMissingDatasetReferences.searchUpdatesForRecord(target, cleanedChildrens)); + } else if (relType.equals("isReferencedBy")) { + list.addAll(enrichMissingDatasetIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens)); + } else if (relType.equals("isSupplementedTo")) { + list.addAll(enrichMissingDatasetIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens)); + } else if (relType.equals("isSupplementedBy")) { + list.addAll(enrichMissingDatasetIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens)); + } + } + + return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList()); + + } + + public static Dataset readPath( + final SparkSession spark, + final String inputPath, + final Class clazz) { + return spark + .read() + .textFile(inputPath) + .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); + } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAbstract.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAbstract.java index 43cf738f8..6dab6355f 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAbstract.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAbstract.java @@ -9,7 +9,7 @@ import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMissingAbstract extends UpdateMatcher { +public class EnrichMissingAbstract extends UpdateMatcher { public EnrichMissingAbstract() { super(false); @@ -24,7 +24,8 @@ public class EnrichMissingAbstract extends UpdateMatcher { } @Override - public UpdateInfo generateUpdateInfo(final String highlightValue, final Result source, + public UpdateInfo generateUpdateInfo(final String highlightValue, + final Result source, final Result target) { return new UpdateInfo<>( Topic.ENRICH_MISSING_ABSTRACT, diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAuthorOrcid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAuthorOrcid.java index beeccdbe8..c7146ad79 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAuthorOrcid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAuthorOrcid.java @@ -10,7 +10,7 @@ import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMissingAuthorOrcid extends UpdateMatcher> { +public class EnrichMissingAuthorOrcid extends UpdateMatcher> { public EnrichMissingAuthorOrcid() { super(true); @@ -24,7 +24,8 @@ public class EnrichMissingAuthorOrcid extends UpdateMatcher @Override public UpdateInfo> generateUpdateInfo(final Pair highlightValue, - final Result source, final Result target) { + final Result source, + final Result target) { return new UpdateInfo<>( Topic.ENRICH_MISSING_AUTHOR_ORCID, highlightValue, source, target, diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetIsReferencedBy.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetIsReferencedBy.java new file mode 100644 index 000000000..3b9326fef --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetIsReferencedBy.java @@ -0,0 +1,38 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMissingDatasetIsReferencedBy + extends UpdateMatcher>, eu.dnetlib.broker.objects.Dataset> { + + public EnrichMissingDatasetIsReferencedBy() { + super(true); + } + + @Override + protected List> findUpdates(final Pair> source, + final Pair> target) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected UpdateInfo generateUpdateInfo( + final eu.dnetlib.broker.objects.Dataset highlightValue, + final Pair> source, + final Pair> target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_DATASET_IS_REFERENCED_BY, + highlightValue, source.getLeft(), target.getLeft(), + (p, rel) -> p.getDatasets().add(rel), + rel -> rel.getInstances().get(0).getUrl()); + } +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetIsRelatedTo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetIsRelatedTo.java new file mode 100644 index 000000000..35f7c52b4 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetIsRelatedTo.java @@ -0,0 +1,38 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMissingDatasetIsRelatedTo + extends UpdateMatcher>, eu.dnetlib.broker.objects.Dataset> { + + public EnrichMissingDatasetIsRelatedTo() { + super(true); + } + + @Override + protected List> findUpdates(final Pair> source, + final Pair> target) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected UpdateInfo generateUpdateInfo( + final eu.dnetlib.broker.objects.Dataset highlightValue, + final Pair> source, + final Pair> target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_DATASET_IS_RELATED_TO, + highlightValue, source.getLeft(), target.getLeft(), + (p, rel) -> p.getDatasets().add(rel), + rel -> rel.getInstances().get(0).getUrl()); + } +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetIsSupplementedBy.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetIsSupplementedBy.java new file mode 100644 index 000000000..1faa305b5 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetIsSupplementedBy.java @@ -0,0 +1,38 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMissingDatasetIsSupplementedBy + extends UpdateMatcher>, eu.dnetlib.broker.objects.Dataset> { + + public EnrichMissingDatasetIsSupplementedBy() { + super(true); + } + + @Override + protected List> findUpdates(final Pair> source, + final Pair> target) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected UpdateInfo generateUpdateInfo( + final eu.dnetlib.broker.objects.Dataset highlightValue, + final Pair> source, + final Pair> target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_BY, + highlightValue, source.getLeft(), target.getLeft(), + (p, rel) -> p.getDatasets().add(rel), + rel -> rel.getInstances().get(0).getUrl()); + } +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetIsSupplementedTo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetIsSupplementedTo.java new file mode 100644 index 000000000..d1b067272 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetIsSupplementedTo.java @@ -0,0 +1,38 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMissingDatasetIsSupplementedTo + extends UpdateMatcher>, eu.dnetlib.broker.objects.Dataset> { + + public EnrichMissingDatasetIsSupplementedTo() { + super(true); + } + + @Override + protected List> findUpdates(final Pair> source, + final Pair> target) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected UpdateInfo generateUpdateInfo( + final eu.dnetlib.broker.objects.Dataset highlightValue, + final Pair> source, + final Pair> target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_TO, + highlightValue, source.getLeft(), target.getLeft(), + (p, rel) -> p.getDatasets().add(rel), + rel -> rel.getInstances().get(0).getUrl()); + } +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetReferences.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetReferences.java new file mode 100644 index 000000000..ce6adeba2 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingDatasetReferences.java @@ -0,0 +1,38 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMissingDatasetReferences + extends UpdateMatcher>, eu.dnetlib.broker.objects.Dataset> { + + public EnrichMissingDatasetReferences() { + super(true); + } + + @Override + protected List> findUpdates(final Pair> source, + final Pair> target) { + // TODO Auto-generated method stub + return null; + } + + @Override + protected UpdateInfo generateUpdateInfo( + final eu.dnetlib.broker.objects.Dataset highlightValue, + final Pair> source, + final Pair> target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_DATASET_REFERENCES, + highlightValue, source.getLeft(), target.getLeft(), + (p, rel) -> p.getDatasets().add(rel), + rel -> rel.getInstances().get(0).getUrl()); + } +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingOpenAccess.java index a4a2ea0c6..81263c6c3 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingOpenAccess.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingOpenAccess.java @@ -12,7 +12,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMissingOpenAccess extends UpdateMatcher { +public class EnrichMissingOpenAccess extends UpdateMatcher { public EnrichMissingOpenAccess() { super(true); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPid.java index a8df62541..5f10bb4d9 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPid.java @@ -11,7 +11,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMissingPid extends UpdateMatcher { +public class EnrichMissingPid extends UpdateMatcher { public EnrichMissingPid() { super(true); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingProject.java index b6e5b3b57..0197c99b1 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingProject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingProject.java @@ -4,30 +4,35 @@ package eu.dnetlib.dhp.broker.oa.matchers; import java.util.Arrays; import java.util.List; -import eu.dnetlib.broker.objects.Project; +import org.apache.commons.lang3.tuple.Pair; + import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMissingProject extends UpdateMatcher { +public class EnrichMissingProject + extends UpdateMatcher>, eu.dnetlib.broker.objects.Project> { public EnrichMissingProject() { super(true); } @Override - protected List> findUpdates(final Result source, final Result target) { - // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); + protected List> findUpdates(final Pair> source, + final Pair> target) { + // TODO return Arrays.asList(); } @Override - public UpdateInfo generateUpdateInfo(final Project highlightValue, - final Result source, - final Result target) { + public UpdateInfo generateUpdateInfo( + final eu.dnetlib.broker.objects.Project highlightValue, + final Pair> source, + final Pair> target) { return new UpdateInfo<>( Topic.ENRICH_MISSING_PROJECT, - highlightValue, source, target, + highlightValue, source.getLeft(), target.getLeft(), (p, prj) -> p.getProjects().add(prj), prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode()); } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationDate.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationDate.java index e9ec082c4..19ef2bab7 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationDate.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationDate.java @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.broker.oa.matchers; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -8,7 +9,7 @@ import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMissingPublicationDate extends UpdateMatcher { +public class EnrichMissingPublicationDate extends UpdateMatcher { public EnrichMissingPublicationDate() { super(false); @@ -16,12 +17,15 @@ public class EnrichMissingPublicationDate extends UpdateMatcher { @Override protected List> findUpdates(final Result source, final Result target) { - // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); - return Arrays.asList(); + if (isMissing(target.getDateofacceptance()) && !isMissing(source.getDateofacceptance())) { + return Arrays.asList(generateUpdateInfo(source.getDateofacceptance().getValue(), source, target)); + } + return new ArrayList<>(); } @Override - public UpdateInfo generateUpdateInfo(final String highlightValue, final Result source, + public UpdateInfo generateUpdateInfo(final String highlightValue, + final Result source, final Result target) { return new UpdateInfo<>( Topic.ENRICH_MISSING_PUBLICATION_DATE, diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationIsReferencedBy.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationIsReferencedBy.java new file mode 100644 index 000000000..8bcee5a1f --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationIsReferencedBy.java @@ -0,0 +1,42 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.Arrays; +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMissingPublicationIsReferencedBy + extends UpdateMatcher>, eu.dnetlib.broker.objects.Publication> { + + public EnrichMissingPublicationIsReferencedBy() { + super(true); + } + + @Override + protected List> findUpdates( + final Pair> source, + final Pair> target) { + // TODO Auto-generated method stub + return Arrays.asList(); + } + + @Override + protected UpdateInfo generateUpdateInfo( + final eu.dnetlib.broker.objects.Publication highlightValue, + final Pair> source, + final Pair> target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_PUBLICATION_IS_REFERENCED_BY, + highlightValue, source.getLeft(), target.getLeft(), + (p, rel) -> { + }, // p.getPublications().add(rel), //TODO available in the future release of dnet-openaire-broker-common + rel -> rel.getOriginalId()); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationIsRelatedTo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationIsRelatedTo.java new file mode 100644 index 000000000..0c16f9f56 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationIsRelatedTo.java @@ -0,0 +1,42 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.Arrays; +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMissingPublicationIsRelatedTo + extends UpdateMatcher>, eu.dnetlib.broker.objects.Publication> { + + public EnrichMissingPublicationIsRelatedTo() { + super(true); + } + + @Override + protected List> findUpdates( + final Pair> source, + final Pair> target) { + // TODO Auto-generated method stub + return Arrays.asList(); + } + + @Override + protected UpdateInfo generateUpdateInfo( + final eu.dnetlib.broker.objects.Publication highlightValue, + final Pair> source, + final Pair> target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_PUBLICATION_IS_RELATED_TO, + highlightValue, source.getLeft(), target.getLeft(), + (p, rel) -> { + }, // p.getPublications().add(rel), //TODO available in the future release of dnet-openaire-broker-common + rel -> rel.getOriginalId()); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationIsSupplementedBy.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationIsSupplementedBy.java new file mode 100644 index 000000000..0b3c33270 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationIsSupplementedBy.java @@ -0,0 +1,42 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.Arrays; +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMissingPublicationIsSupplementedBy + extends UpdateMatcher>, eu.dnetlib.broker.objects.Publication> { + + public EnrichMissingPublicationIsSupplementedBy() { + super(true); + } + + @Override + protected List> findUpdates( + final Pair> source, + final Pair> target) { + // TODO Auto-generated method stub + return Arrays.asList(); + } + + @Override + protected UpdateInfo generateUpdateInfo( + final eu.dnetlib.broker.objects.Publication highlightValue, + final Pair> source, + final Pair> target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_BY, + highlightValue, source.getLeft(), target.getLeft(), + (p, rel) -> { + }, // p.getPublications().add(rel), //TODO available in the future release of dnet-openaire-broker-common + rel -> rel.getOriginalId()); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationIsSupplementedTo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationIsSupplementedTo.java new file mode 100644 index 000000000..0e72a8423 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationIsSupplementedTo.java @@ -0,0 +1,42 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.Arrays; +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMissingPublicationIsSupplementedTo + extends UpdateMatcher>, eu.dnetlib.broker.objects.Publication> { + + public EnrichMissingPublicationIsSupplementedTo() { + super(true); + } + + @Override + protected List> findUpdates( + final Pair> source, + final Pair> target) { + // TODO Auto-generated method stub + return Arrays.asList(); + } + + @Override + protected UpdateInfo generateUpdateInfo( + final eu.dnetlib.broker.objects.Publication highlightValue, + final Pair> source, + final Pair> target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_TO, + highlightValue, source.getLeft(), target.getLeft(), + (p, rel) -> { + }, // p.getPublications().add(rel), //TODO available in the future release of dnet-openaire-broker-common + rel -> rel.getOriginalId()); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationReferences.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationReferences.java new file mode 100644 index 000000000..6d1124974 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationReferences.java @@ -0,0 +1,42 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.Arrays; +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMissingPublicationReferences + extends UpdateMatcher>, eu.dnetlib.broker.objects.Publication> { + + public EnrichMissingPublicationReferences() { + super(true); + } + + @Override + protected List> findUpdates( + final Pair> source, + final Pair> target) { + // TODO Auto-generated method stub + return Arrays.asList(); + } + + @Override + protected UpdateInfo generateUpdateInfo( + final eu.dnetlib.broker.objects.Publication highlightValue, + final Pair> source, + final Pair> target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_PUBLICATION_REFERENCES, + highlightValue, source.getLeft(), target.getLeft(), + (p, rel) -> { + }, // p.getPublications().add(rel), //TODO available in the future release of dnet-openaire-broker-common + rel -> rel.getOriginalId()); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingSoftware.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingSoftware.java new file mode 100644 index 000000000..954ee48be --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingSoftware.java @@ -0,0 +1,41 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.Arrays; +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.Software; + +public class EnrichMissingSoftware + extends UpdateMatcher>, eu.dnetlib.broker.objects.Software> { + + public EnrichMissingSoftware() { + super(true); + } + + @Override + protected List> findUpdates( + final Pair> source, + final Pair> target) { + // TODO + return Arrays.asList(); + } + + @Override + public UpdateInfo generateUpdateInfo( + final eu.dnetlib.broker.objects.Software highlightValue, + final Pair> source, + final Pair> target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_SOFTWARE, + highlightValue, source.getLeft(), target.getLeft(), + (p, s) -> p.getSoftwares().add(s), + s -> s.getName()); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingSubject.java index 79e9d469b..a7c72f6ea 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingSubject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingSubject.java @@ -14,7 +14,7 @@ import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -public class EnrichMissingSubject extends UpdateMatcher> { +public class EnrichMissingSubject extends UpdateMatcher> { public EnrichMissingSubject() { super(true); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreOpenAccess.java index 40c9b0500..2cd2775c9 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreOpenAccess.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreOpenAccess.java @@ -12,7 +12,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMoreOpenAccess extends UpdateMatcher { +public class EnrichMoreOpenAccess extends UpdateMatcher { public EnrichMoreOpenAccess() { super(true); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMorePid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMorePid.java index 0e7b7766a..048d19747 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMorePid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMorePid.java @@ -11,7 +11,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMorePid extends UpdateMatcher { +public class EnrichMorePid extends UpdateMatcher { public EnrichMorePid() { super(true); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreProject.java new file mode 100644 index 000000000..4bf45d943 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreProject.java @@ -0,0 +1,39 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.Arrays; +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Project; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMoreProject extends UpdateMatcher>, eu.dnetlib.broker.objects.Project> { + + public EnrichMoreProject() { + super(true); + } + + @Override + protected List> findUpdates(final Pair> source, + final Pair> target) { + // TODO + return Arrays.asList(); + } + + @Override + public UpdateInfo generateUpdateInfo( + final eu.dnetlib.broker.objects.Project highlightValue, + final Pair> source, + final Pair> target) { + return new UpdateInfo<>( + Topic.ENRICH_MORE_PROJECT, + highlightValue, source.getLeft(), target.getLeft(), + (p, prj) -> p.getProjects().add(prj), + prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode()); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreSoftware.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreSoftware.java new file mode 100644 index 000000000..9760504f6 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreSoftware.java @@ -0,0 +1,41 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.Arrays; +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.Software; + +public class EnrichMoreSoftware + extends UpdateMatcher>, eu.dnetlib.broker.objects.Software> { + + public EnrichMoreSoftware() { + super(true); + } + + @Override + protected List> findUpdates( + final Pair> source, + final Pair> target) { + // TODO + return Arrays.asList(); + } + + @Override + public UpdateInfo generateUpdateInfo( + final eu.dnetlib.broker.objects.Software highlightValue, + final Pair> source, + final Pair> target) { + return new UpdateInfo<>( + Topic.ENRICH_MORE_SOFTWARE, + highlightValue, source.getLeft(), target.getLeft(), + (p, s) -> p.getSoftwares().add(s), + s -> s.getName()); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreSubject.java index e6374479b..67c2f0116 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreSubject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreSubject.java @@ -12,7 +12,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; -public class EnrichMoreSubject extends UpdateMatcher> { +public class EnrichMoreSubject extends UpdateMatcher> { public EnrichMoreSubject() { super(true); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java index b8b6132cd..5bfe108a5 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java @@ -12,9 +12,8 @@ import org.apache.commons.lang3.StringUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Field; -import eu.dnetlib.dhp.schema.oaf.Result; -public abstract class UpdateMatcher { +public abstract class UpdateMatcher { private final boolean multipleUpdate; @@ -22,11 +21,11 @@ public abstract class UpdateMatcher { this.multipleUpdate = multipleUpdate; } - public Collection> searchUpdatesForRecord(final Result res, final Result... others) { + public Collection> searchUpdatesForRecord(final K res, final Collection others) { final Map> infoMap = new HashMap<>(); - for (final Result source : others) { + for (final K source : others) { if (source != res) { for (final UpdateInfo info : findUpdates(source, res)) { final String s = DigestUtils.md5Hex(info.getHighlightValueAsString()); @@ -52,13 +51,18 @@ public abstract class UpdateMatcher { } } - protected abstract List> findUpdates(Result source, Result target); + protected abstract List> findUpdates(K source, K target); - protected abstract UpdateInfo generateUpdateInfo(final T highlightValue, final Result source, - final Result target); + protected abstract UpdateInfo generateUpdateInfo(final T highlightValue, + final K source, + final K target); protected static boolean isMissing(final List> list) { return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0).getValue()); } + protected boolean isMissing(final Field field) { + return field == null || StringUtils.isBlank(field.getValue()); + } + } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java index d61d5bfb7..8e97192bf 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java @@ -4,4 +4,6 @@ package eu.dnetlib.dhp.broker.oa.util; public class BrokerConstants { public final static String OPEN_ACCESS = "OPEN"; + public final static String IS_MERGED_IN_CLASS = "isMergedIn"; + } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AuthorMerger.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AuthorMerger.java new file mode 100644 index 000000000..43df19f8a --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AuthorMerger.java @@ -0,0 +1,164 @@ + +package eu.dnetlib.dhp.oa.dedup; + +import java.text.Normalizer; +import java.util.*; +import java.util.stream.Collectors; + +import org.apache.commons.lang3.StringUtils; + +import com.wcohen.ss.JaroWinkler; + +import eu.dnetlib.dhp.schema.oaf.Author; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.pace.model.Person; +import scala.Tuple2; + +public class AuthorMerger { + + private static final Double THRESHOLD = 0.95; + + public static List merge(List> authors) { + + authors.sort(new Comparator>() { + @Override + public int compare(List o1, List o2) { + return -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2)); + } + }); + + List author = new ArrayList<>(); + + for (List a : authors) { + author = mergeAuthor(author, a); + } + + return author; + + } + + public static List mergeAuthor(final List a, final List b) { + int pa = countAuthorsPids(a); + int pb = countAuthorsPids(b); + List base, enrich; + int sa = authorsSize(a); + int sb = authorsSize(b); + + if (pa == pb) { + base = sa > sb ? a : b; + enrich = sa > sb ? b : a; + } else { + base = pa > pb ? a : b; + enrich = pa > pb ? b : a; + } + enrichPidFromList(base, enrich); + return base; + } + + private static void enrichPidFromList(List base, List enrich) { + if (base == null || enrich == null) + return; + final Map basePidAuthorMap = base + .stream() + .filter(a -> a.getPid() != null && a.getPid().size() > 0) + .flatMap( + a -> a + .getPid() + .stream() + .map(p -> new Tuple2<>(pidToComparableString(p), a))) + .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); + + final List> pidToEnrich = enrich + .stream() + .filter(a -> a.getPid() != null && a.getPid().size() > 0) + .flatMap( + a -> a + .getPid() + .stream() + .filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p))) + .map(p -> new Tuple2<>(p, a))) + .collect(Collectors.toList()); + + pidToEnrich + .forEach( + a -> { + Optional> simAuthor = base + .stream() + .map(ba -> new Tuple2<>(sim(ba, a._2()), ba)) + .max(Comparator.comparing(Tuple2::_1)); + if (simAuthor.isPresent() && simAuthor.get()._1() > THRESHOLD) { + Author r = simAuthor.get()._2(); + if (r.getPid() == null) { + r.setPid(new ArrayList<>()); + } + r.getPid().add(a._1()); + } + }); + } + + public static String pidToComparableString(StructuredProperty pid) { + return (pid.getQualifier() != null + ? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : "" + : "") + (pid.getValue() != null ? pid.getValue().toLowerCase() : ""); + } + + public static int countAuthorsPids(List authors) { + if (authors == null) + return 0; + + return (int) authors.stream().filter(AuthorMerger::hasPid).count(); + } + + private static int authorsSize(List authors) { + if (authors == null) + return 0; + return authors.size(); + } + + private static Double sim(Author a, Author b) { + + final Person pa = parse(a); + final Person pb = parse(b); + + if (pa.isAccurate() & pb.isAccurate()) { + return new JaroWinkler() + .score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())); + } else { + return new JaroWinkler() + .score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname())); + } + } + + private static boolean hasPid(Author a) { + if (a == null || a.getPid() == null || a.getPid().size() == 0) + return false; + return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue())); + } + + private static Person parse(Author author) { + if (StringUtils.isNotBlank(author.getSurname())) { + return new Person(author.getSurname() + ", " + author.getName(), false); + } else { + return new Person(author.getFullname(), false); + } + } + + private static String normalize(final String s) { + return nfd(s) + .toLowerCase() + // do not compact the regexes in a single expression, would cause StackOverflowError + // in case + // of large input strings + .replaceAll("(\\W)+", " ") + .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") + .replaceAll("(\\p{Punct})+", " ") + .replaceAll("(\\d)+", " ") + .replaceAll("(\\n)+", " ") + .trim(); + } + + private static String nfd(final String s) { + return Normalizer.normalize(s, Normalizer.Form.NFD); + } + +} diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java index fa06424d7..8028d5a94 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java @@ -1,8 +1,10 @@ package eu.dnetlib.dhp.oa.dedup; +import java.io.Serializable; import java.util.Collection; import java.util.Iterator; +import java.util.List; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapGroupsFunction; @@ -67,16 +69,19 @@ public class DedupRecordFactory { (MapFunction, String>) entity -> entity._1(), Encoders.STRING()) .mapGroups( (MapGroupsFunction, T>) (key, - values) -> entityMerger(key, values, ts, dataInfo), + values) -> entityMerger(key, values, ts, dataInfo, clazz), Encoders.bean(clazz)); } - private static T entityMerger( - String id, Iterator> entities, long ts, DataInfo dataInfo) { + public static T entityMerger( + String id, Iterator> entities, long ts, DataInfo dataInfo, Class clazz) + throws IllegalAccessException, InstantiationException { - T entity = entities.next()._2(); + T entity = clazz.newInstance(); final Collection dates = Lists.newArrayList(); + final List> authors = Lists.newArrayList(); + entities .forEachRemaining( t -> { @@ -84,17 +89,17 @@ public class DedupRecordFactory { entity.mergeFrom(duplicate); if (ModelSupport.isSubClass(duplicate, Result.class)) { Result r1 = (Result) duplicate; - Result er = (Result) entity; - er.setAuthor(DedupUtility.mergeAuthor(er.getAuthor(), r1.getAuthor())); - - if (r1.getDateofacceptance() != null) { + if (r1.getAuthor() != null && r1.getAuthor().size() > 0) + authors.add(r1.getAuthor()); + if (r1.getDateofacceptance() != null) dates.add(r1.getDateofacceptance().getValue()); - } } }); + // set authors and date if (ModelSupport.isSubClass(entity, Result.class)) { ((Result) entity).setDateofacceptance(DatePicker.pick(dates)); + ((Result) entity).setAuthor(AuthorMerger.merge(authors)); } entity.setId(id); diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java index a6ec364c4..222794d64 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java @@ -32,7 +32,6 @@ import eu.dnetlib.pace.model.Person; import scala.Tuple2; public class DedupUtility { - private static final Double THRESHOLD = 0.95; public static Map constructAccumulator( final DedupConfig dedupConf, final SparkContext context) { @@ -82,61 +81,6 @@ public class DedupUtility { } } - public static List mergeAuthor(final List a, final List b) { - int pa = countAuthorsPids(a); - int pb = countAuthorsPids(b); - List base, enrich; - int sa = authorsSize(a); - int sb = authorsSize(b); - - if (pa == pb) { - base = sa > sb ? a : b; - enrich = sa > sb ? b : a; - } else { - base = pa > pb ? a : b; - enrich = pa > pb ? b : a; - } - enrichPidFromList(base, enrich); - return base; - } - - private static void enrichPidFromList(List base, List enrich) { - if (base == null || enrich == null) - return; - final Map basePidAuthorMap = base - .stream() - .filter(a -> a.getPid() != null && a.getPid().size() > 0) - .flatMap(a -> a.getPid().stream().map(p -> new Tuple2<>(p.toComparableString(), a))) - .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); - - final List> pidToEnrich = enrich - .stream() - .filter(a -> a.getPid() != null && a.getPid().size() > 0) - .flatMap( - a -> a - .getPid() - .stream() - .filter(p -> !basePidAuthorMap.containsKey(p.toComparableString())) - .map(p -> new Tuple2<>(p, a))) - .collect(Collectors.toList()); - - pidToEnrich - .forEach( - a -> { - Optional> simAuhtor = base - .stream() - .map(ba -> new Tuple2<>(sim(ba, a._2()), ba)) - .max(Comparator.comparing(Tuple2::_1)); - if (simAuhtor.isPresent() && simAuhtor.get()._1() > THRESHOLD) { - Author r = simAuhtor.get()._2(); - if (r.getPid() == null) { - r.setPid(new ArrayList<>()); - } - r.getPid().add(a._1()); - } - }); - } - public static String createDedupRecordPath( final String basePath, final String actionSetId, final String entityType) { return String.format("%s/%s/%s_deduprecord", basePath, actionSetId, entityType); @@ -156,65 +100,6 @@ public class DedupUtility { return String.format("%s/%s/%s_mergerel", basePath, actionSetId, entityType); } - private static Double sim(Author a, Author b) { - - final Person pa = parse(a); - final Person pb = parse(b); - - if (pa.isAccurate() & pb.isAccurate()) { - return new JaroWinkler() - .score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())); - } else { - return new JaroWinkler() - .score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname())); - } - } - - private static String normalize(final String s) { - return nfd(s) - .toLowerCase() - // do not compact the regexes in a single expression, would cause StackOverflowError - // in case - // of large input strings - .replaceAll("(\\W)+", " ") - .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") - .replaceAll("(\\p{Punct})+", " ") - .replaceAll("(\\d)+", " ") - .replaceAll("(\\n)+", " ") - .trim(); - } - - private static String nfd(final String s) { - return Normalizer.normalize(s, Normalizer.Form.NFD); - } - - private static Person parse(Author author) { - if (StringUtils.isNotBlank(author.getSurname())) { - return new Person(author.getSurname() + ", " + author.getName(), false); - } else { - return new Person(author.getFullname(), false); - } - } - - private static int countAuthorsPids(List authors) { - if (authors == null) - return 0; - - return (int) authors.stream().filter(DedupUtility::hasPid).count(); - } - - private static int authorsSize(List authors) { - if (authors == null) - return 0; - return authors.size(); - } - - private static boolean hasPid(Author a) { - if (a == null || a.getPid() == null || a.getPid().size() == 0) - return false; - return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue())); - } - public static List getConfigurations(String isLookUpUrl, String orchestrator) throws ISLookUpException, DocumentException { final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookUpUrl); diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java new file mode 100644 index 000000000..0a3bf62ea --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java @@ -0,0 +1,139 @@ + +package eu.dnetlib.dhp.oa.dedup; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.io.Serializable; +import java.nio.file.Paths; +import java.util.*; + +import org.codehaus.jackson.map.ObjectMapper; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.pace.util.MapDocumentUtil; +import scala.Tuple2; + +public class EntityMergerTest implements Serializable { + + List> publications; + + String testEntityBasePath; + DataInfo dataInfo; + String dedupId = "dedup_id"; + Publication pub_top; + + @BeforeEach + public void setUp() throws Exception { + + testEntityBasePath = Paths + .get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/json").toURI()) + .toFile() + .getAbsolutePath(); + + publications = readSample(testEntityBasePath + "/publication_merge.json", Publication.class); + + pub_top = getTopPub(publications); + + dataInfo = setDI(); + + } + + @Test + public void publicationMergerTest() throws InstantiationException, IllegalAccessException { + + Publication pub_merged = DedupRecordFactory + .entityMerger(dedupId, publications.iterator(), 0, dataInfo, Publication.class); + + assertEquals(dedupId, pub_merged.getId()); + + assertEquals(pub_merged.getJournal(), pub_top.getJournal()); + assertEquals(pub_merged.getBestaccessright(), pub_top.getBestaccessright()); + assertEquals(pub_merged.getResulttype(), pub_top.getResulttype()); + assertEquals(pub_merged.getLanguage(), pub_merged.getLanguage()); + assertEquals(pub_merged.getPublisher(), pub_top.getPublisher()); + assertEquals(pub_merged.getEmbargoenddate(), pub_top.getEmbargoenddate()); + assertEquals(pub_merged.getResourcetype().getClassid(), "0004"); + assertEquals(pub_merged.getDateoftransformation(), pub_top.getDateoftransformation()); + assertEquals(pub_merged.getOaiprovenance(), pub_top.getOaiprovenance()); + assertEquals(pub_merged.getDateofcollection(), pub_top.getDateofcollection()); + assertEquals(pub_merged.getInstance().size(), 3); + assertEquals(pub_merged.getCountry().size(), 2); + assertEquals(pub_merged.getSubject().size(), 0); + assertEquals(pub_merged.getTitle().size(), 2); + assertEquals(pub_merged.getRelevantdate().size(), 0); + assertEquals(pub_merged.getDescription().size(), 0); + assertEquals(pub_merged.getSource().size(), 0); + assertEquals(pub_merged.getFulltext().size(), 0); + assertEquals(pub_merged.getFormat().size(), 0); + assertEquals(pub_merged.getContributor().size(), 0); + assertEquals(pub_merged.getCoverage().size(), 0); + assertEquals(pub_merged.getContext().size(), 0); + assertEquals(pub_merged.getExternalReference().size(), 0); + assertEquals(pub_merged.getOriginalId().size(), 3); + assertEquals(pub_merged.getCollectedfrom().size(), 3); + assertEquals(pub_merged.getPid().size(), 1); + assertEquals(pub_merged.getExtraInfo().size(), 0); + + // verify datainfo + assertEquals(pub_merged.getDataInfo(), dataInfo); + + // verify datepicker + assertEquals(pub_merged.getDateofacceptance().getValue(), "2018-09-30"); + + // verify authors + assertEquals(pub_merged.getAuthor().size(), 9); + assertEquals(AuthorMerger.countAuthorsPids(pub_merged.getAuthor()), 4); + } + + public DataInfo setDI() { + DataInfo dataInfo = new DataInfo(); + dataInfo.setTrust("0.9"); + dataInfo.setDeletedbyinference(false); + dataInfo.setInferenceprovenance("testing"); + dataInfo.setInferred(true); + return dataInfo; + } + + public Publication getTopPub(List> publications) { + + Double maxTrust = 0.0; + Publication maxPub = new Publication(); + for (Tuple2 publication : publications) { + Double pubTrust = Double.parseDouble(publication._2().getDataInfo().getTrust()); + if (pubTrust > maxTrust) { + maxTrust = pubTrust; + maxPub = publication._2(); + } + } + return maxPub; + } + + public List> readSample(String path, Class clazz) { + List> res = new ArrayList<>(); + BufferedReader reader; + try { + reader = new BufferedReader(new FileReader(path)); + String line = reader.readLine(); + while (line != null) { + res + .add( + new Tuple2<>( + MapDocumentUtil.getJPathString("$.id", line), + new ObjectMapper().readValue(line, clazz))); + // read next line + line = reader.readLine(); + } + reader.close(); + } catch (IOException e) { + e.printStackTrace(); + } + + return res; + } + +} diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/MergeAuthorTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/MergeAuthorTest.java deleted file mode 100644 index a217a2657..000000000 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/MergeAuthorTest.java +++ /dev/null @@ -1,54 +0,0 @@ - -package eu.dnetlib.dhp.oa.dedup; - -import java.io.IOException; -import java.util.Arrays; -import java.util.List; -import java.util.stream.Collectors; - -import org.apache.commons.io.IOUtils; -import org.codehaus.jackson.map.ObjectMapper; -import org.junit.jupiter.api.BeforeEach; - -import eu.dnetlib.dhp.schema.oaf.Publication; - -public class MergeAuthorTest { - - private List publicationsToMerge; - private final ObjectMapper mapper = new ObjectMapper(); - - @BeforeEach - public void setUp() throws Exception { - final String json = IOUtils - .toString( - this.getClass().getResourceAsStream("/eu/dnetlib/dhp/dedup/json/authors_merge.json")); - - publicationsToMerge = Arrays - .asList(json.split("\n")) - .stream() - .map( - s -> { - try { - return mapper.readValue(s, Publication.class); - } catch (IOException e) { - throw new RuntimeException(e); - } - }) - .collect(Collectors.toList()); - } - - // FIX ME Michele DB this tests doesn't work - // @Test - public void test() throws Exception { - Publication dedup = new Publication(); - - publicationsToMerge - .forEach( - p -> { - dedup.mergeFrom(p); - dedup.setAuthor(DedupUtility.mergeAuthor(dedup.getAuthor(), p.getAuthor())); - }); - - System.out.println(mapper.writeValueAsString(dedup)); - } -} diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/publication_merge.json b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/publication_merge.json new file mode 100644 index 000000000..015f9294a --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/publication_merge.json @@ -0,0 +1,3 @@ +{"context": [], "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:datasetarchive", "classname": "sysimport:crosswalk:datasetarchive", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "dedup-similarity-result-levenstein", "invisible": false, "trust": "0.95"}, "resourcetype": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "pid": [], "contributor": [], "resulttype": {"classid": "publication", "classname": "publication", "schemename": "dnet:result_typologies", "schemeid": "dnet:result_typologies"}, "relevantdate": [], "collectedfrom": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "Journal.fi", "key": "10|openaire____::6eef8049d0feedc089ee009abca55e35"}], "id": "50|a89337edbe55::4930db9e954866d70916cbfba9f81f97", "subject": [], "instance": [{"refereed": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "hostedby": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "Journal.fi", "key": "10|openaire____::6eef8049d0feedc089ee009abca55e35"}, "processingchargeamount": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "license": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "url": [], "distributionlocation": "", "processingchargecurrency": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "2016-01-01"}, "collectedfrom": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "Journal.fi", "key": "10|openaire____::6eef8049d0feedc089ee009abca55e35"}, "accessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "instancetype": {"classid": "0001", "classname": "Article", "schemename": "dnet:dataCite_resource", "schemeid": "dnet:dataCite_resource"}}], "embargoenddate": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "lastupdatetimestamp": 0, "author": [{"surname": "Go\\u0308tz", "name": "Julia", "pid": [], "rank": 5, "affiliation": [], "fullname": "G\\u00f6tz, Julia"}, {"surname": "Wolff", "name": "Stephan", "pid": [], "rank": 6, "affiliation": [], "fullname": "Wolff, Stephan"}, {"surname": "Jansen", "name": "Olav", "pid": [], "rank": 7, "affiliation": [], "fullname": "Jansen, Olav"}, {"surname": "Dressler", "name": "Dirk", "pid": [{"qualifier": {"classid": "ORCID", "classname": "ORCID"}, "value": "0000-0000-0656-9999"},{"qualifier": {"classid": "id", "classname": "id"}, "value": "987654321"}], "rank": 8, "affiliation": [], "fullname": "Dressler, Dirk"}, {"surname": "Schneider", "name": "Susanne A.", "pid": [], "rank": 9, "affiliation": [], "fullname": "Schneider, Susanne A."}], "source": [], "dateofcollection": "2019-11-05T14:49:22.351Z", "fulltext": [], "dateoftransformation": "2019-11-05T16:10:58.988Z", "description": [], "format": [], "journal": {"issnPrinted": "1459-6067", "conferencedate": "", "conferenceplace": "", "name": "Agricultural and Food Science", "edition": "", "iss": "3", "sp": "", "vol": "27", "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "issnOnline": "1795-1895", "ep": "", "issnLinking": ""}, "coverage": [], "publisher": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "language": {"classid": "eng", "classname": "English", "schemename": "dnet:languages", "schemeid": "dnet:languages"}, "bestaccessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "country": [], "extraInfo": [], "originalId": [], "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "2018-09-30"}, "title": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "qualifier": {"classid": "main title", "classname": "main title", "schemename": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title"}, "value": "Altered brain activation in a reversal learning task unmasks adaptive changes in cognitive control in writer's cramp"}]} +{"context": [], "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "dedup-similarity-result-levenstein", "invisible": false, "trust": "0.9"}, "resourcetype": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "pid": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "qualifier": {"classid": "doi", "classname": "doi", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.1016/j.nicl.2015.11.006"}], "contributor": [], "resulttype": {"classid": "publication", "classname": "publication", "schemename": "dnet:result_typologies", "schemeid": "dnet:result_typologies"}, "relevantdate": [], "collectedfrom": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "BASE (Open Access Aggregator)", "key": "10|openaire____::df45502607927471ecf8a6ae83683ff5"}], "id": "50|base_oa_____::0968af610a356656706657e4f234b340", "subject": [], "instance": [{"refereed": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "hostedby": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "NeuroImage: Clinical", "key": "10|doajarticles::0c0e74daa5d95504eade9c81ebbd5b8a"}, "processingchargeamount": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "license": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "http://creativecommons.org/licenses/by-nc-nd/4.0/"}, "url": ["http://dx.doi.org/10.1016/j.nicl.2015.11.006"], "distributionlocation": "", "processingchargecurrency": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "2016-01-01"}, "collectedfrom": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "BASE (Open Access Aggregator)", "key": "10|openaire____::df45502607927471ecf8a6ae83683ff5"}, "accessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "instancetype": {"classid": "0001", "classname": "Article", "schemename": "dnet:publication_resource", "schemeid": "dnet:publication_resource"}}], "embargoenddate": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "lastupdatetimestamp": 0, "author": [{"surname": "Zeuner", "name": "Kirsten E.", "pid": [], "rank": 1, "affiliation": [], "fullname": "Zeuner, Kirsten E."}, {"surname": "Knutzen", "name": "Arne", "pid": [], "rank": 2, "affiliation": [], "fullname": "Knutzen, Arne"}, {"surname": "Granert", "name": "Oliver", "pid": [{"qualifier": {"classid": "ORCID", "classname": "ORCID"}, "value": "0000-0002-0656-1023"}], "rank": 3, "affiliation": [], "fullname": "Granert, Oliver"}, {"surname": "Sablowsky", "name": "Simone", "pid": [], "rank": 4, "affiliation": [], "fullname": "Sablowsky, Simone"}, {"surname": "Go\\u0308tz", "name": "Julia", "pid": [], "rank": 5, "affiliation": [], "fullname": "G\\u00f6tz, Julia"}, {"surname": "Wolff", "name": "Stephan", "pid": [], "rank": 6, "affiliation": [], "fullname": "Wolff, Stephan"}, {"surname": "Jansen", "name": "Olav", "pid": [], "rank": 7, "affiliation": [], "fullname": "Jansen, Olav"}, {"surname": "Dressler", "name": "Dirk", "pid": [], "rank": 8, "affiliation": [], "fullname": "Dressler, Dirk"}, {"surname": "Schneider", "name": "Susanne A.", "pid": [], "rank": 9, "affiliation": [], "fullname": "Schneider, Susanne A."}, {"surname": "Klein", "name": "Christine", "pid": [], "rank": 10, "affiliation": [], "fullname": "Klein, Christine"}, {"surname": "Deuschl", "name": "Gu\\u0308nther", "pid": [], "rank": 11, "affiliation": [], "fullname": "Deuschl, G\\u00fcnther"}, {"surname": "Eimeren", "name": "Thilo", "pid": [], "rank": 12, "affiliation": [], "fullname": "van Eimeren, Thilo"}, {"surname": "Witt", "name": "Karsten", "pid": [], "rank": 13, "affiliation": [], "fullname": "Witt, Karsten"}], "source": [], "dateofcollection": "2017-07-27T19:04:09.131Z", "fulltext": [], "dateoftransformation": "2019-01-23T10:15:19.582Z", "description": [], "format": [], "journal": {"issnPrinted": "2213-1582", "conferencedate": "", "conferenceplace": "", "name": "NeuroImage: Clinical", "edition": "", "iss": "", "sp": "63", "vol": "10", "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "issnOnline": "", "ep": "70", "issnLinking": ""}, "coverage": [], "publisher": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "Elsevier BV"}, "language": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "bestaccessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "country": [{"classid": "IT", "classname": "Italy", "schemeid": "dnet:countries", "schemename": "dnet:countries"}], "extraInfo": [], "originalId": ["10.1016/j.nicl.2015.11.006"], "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "2016-01-01"}, "title": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "qualifier": {"classid": "main title", "classname": "main title", "schemename": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title"}, "value": "Altered brain activation in a reversal learning task unmasks adaptive changes in cognitive control in writer's cramp"}]} +{"context": [], "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:datasetarchive", "classname": "sysimport:crosswalk:datasetarchive", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "dedup-similarity-result-levenstein", "invisible": false, "trust": "0.9"}, "resourcetype": {"classid": "0004", "classname": "Conference object", "schemename": "dnet:dataCite_resource", "schemeid": "dnet:dataCite_resource"}, "pid": [], "contributor": [], "resulttype": {"classid": "publication", "classname": "publication", "schemename": "dnet:result_typologies", "schemeid": "dnet:result_typologies"}, "relevantdate": [], "collectedfrom": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "CRIS UNS (Current Research Information System University of Novi Sad)", "key": "10|CRIS_UNS____::f66f1bd369679b5b077dcdf006089556"}], "id": "50|CrisUnsNoviS::9f9d014eea45dab432cab636c4c9cf39", "subject": [], "instance": [{"refereed": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "hostedby": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "CRIS UNS (Current Research Information System University of Novi Sad)", "key": "10|CRIS_UNS____::f66f1bd369679b5b077dcdf006089556"}, "processingchargeamount": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "license": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "url": ["https://www.cris.uns.ac.rs/record.jsf?recordId=113444&source=OpenAIRE&language=en"], "distributionlocation": "", "processingchargecurrency": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "2019-01-01"}, "collectedfrom": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "CRIS UNS (Current Research Information System University of Novi Sad)", "key": "10|CRIS_UNS____::f66f1bd369679b5b077dcdf006089556"}, "accessright": {"classid": "UNKNOWN", "classname": "UNKNOWN", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "instancetype": {"classid": "0004", "classname": "Conference object", "schemename": "dnet:dataCite_resource", "schemeid": "dnet:dataCite_resource"}}], "embargoenddate": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "lastupdatetimestamp": 0, "author": [{"surname": "Zeuner", "name": "Kirsten E.", "pid": [], "rank": 1, "affiliation": [], "fullname": "Zeuner, Kirsten E."}, {"surname": "Knutzen", "name": "Arne", "pid": [], "rank": 2, "affiliation": [], "fullname": "Knutzen, Arne"}, {"surname": "Granert", "name": "Oliver", "pid": [{"qualifier": {"classid": "ORCID", "classname": "ORCID"}, "value": "0000-0002-0656-1023"}, {"qualifier": {"classid": "pubmed", "classname": "pubmed"}, "value": "pubmed.it"}], "rank": 3, "affiliation": [], "fullname": "Granert, Oliver"}, {"surname": "Sablowsky", "name": "Simone", "pid": [{"qualifier": {"classid": "id", "classname": "id"}, "value": "12345678"}], "rank": 4, "affiliation": [], "fullname": "Sablowsky, Simone"}, {"surname": "Go\\u0308tz", "name": "Julia", "pid": [], "rank": 5, "affiliation": [], "fullname": "G\\u00f6tz, Julia"}, {"surname": "Wolff", "name": "Stephan", "pid": [], "rank": 6, "affiliation": [], "fullname": "Wolff, Stephan"}, {"surname": "Jansen", "name": "Olav", "pid": [{"qualifier": {"classid": "ORCID", "classname": "ORCID"}, "value": "0000-0000-0656-1023"},{"qualifier": {"classid": "id", "classname": "id"}, "value": "987654321"}], "rank": 7, "affiliation": [], "fullname": "Jansen, Olav"}, {"surname": "Dressler", "name": "Dirk", "pid": [], "rank": 8, "affiliation": [], "fullname": "Dressler, Dirk"}, {"surname": "Schneider", "name": "Susanne A.", "pid": [], "rank": 9, "affiliation": [], "fullname": "Schneider, Susanne A."}], "source": [], "dateofcollection": "2020-03-10T15:05:38.685Z", "fulltext": [], "dateoftransformation": "2020-03-11T20:11:13.15Z", "description": [], "format": [], "journal": {"issnPrinted": "", "conferencedate": "", "conferenceplace": "", "name": "", "edition": "", "iss": "", "sp": "", "vol": "", "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "issnOnline": "", "ep": "", "issnLinking": ""}, "coverage": [], "publisher": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "language": {"classid": "en", "classname": "en", "schemename": "dnet:languages", "schemeid": "dnet:languages"}, "bestaccessright": {"classid": "UNKNOWN", "classname": "not available", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "country": [{"classid": "FI", "classname": "Finland", "schemeid": "dnet:countries", "schemename": "dnet:countries"}], "extraInfo": [], "originalId": ["(BISIS)113444", "https://www.cris.uns.ac.rs/record.jsf?recordId=113444&source=OpenAIRE&language=en"], "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "2016-01-01"}, "title": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "qualifier": {"classid": "main title", "classname": "main title", "schemename": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title"}, "value": "Antichains of copies of ultrahomogeneous structures"}]} \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/ResultToOrganizationJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/ResultToOrganizationJobTest.java index 435b76605..cfcccc5f0 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/ResultToOrganizationJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/ResultToOrganizationJobTest.java @@ -99,6 +99,7 @@ public class ResultToOrganizationJobTest { .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); Assertions.assertEquals(0, tmp.count()); + FileUtils.deleteDirectory(workingDir.toFile()); } /** @@ -171,6 +172,7 @@ public class ResultToOrganizationJobTest { + "(target = '20|opendoar____::124266ebc4ece2934eb80edfda3f2091' " + "or target = '20|dedup_wf_001::5168917a6aeeea55269daeac1af2ecd2')") .count()); + FileUtils.deleteDirectory(workingDir.toFile()); } @Test @@ -266,5 +268,6 @@ public class ResultToOrganizationJobTest { "relclass = 'isAuthorInstitutionOf' and " + "substring(source, 1,2) = '20' and substring(target, 1, 2) = '50'") .count()); + FileUtils.deleteDirectory(workingDir.toFile()); } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index b9c4e6c80..c4639eb44 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -10,7 +10,16 @@ import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.listFields; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.oaiIProvenance; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty; -import static eu.dnetlib.dhp.schema.common.ModelConstants.*; +import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASET_DEFAULT_RESULTTYPE; +import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES; +import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PRODUCED_BY; +import static eu.dnetlib.dhp.schema.common.ModelConstants.ORP_DEFAULT_RESULTTYPE; +import static eu.dnetlib.dhp.schema.common.ModelConstants.OUTCOME; +import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES; +import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE; +import static eu.dnetlib.dhp.schema.common.ModelConstants.REPOSITORY_PROVENANCE_ACTIONS; +import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT; +import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE; import java.util.ArrayList; import java.util.Arrays; @@ -50,6 +59,10 @@ public abstract class AbstractMdRecordToOafMapper { protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4"; protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3"; + protected static final Qualifier ORCID_PID_TYPE = qualifier( + "ORCID", "Open Researcher and Contributor ID", DNET_PID_TYPES, DNET_PID_TYPES); + protected static final Qualifier MAG_PID_TYPE = qualifier( + "MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES); protected static final Map nsContext = new HashMap<>(); @@ -75,8 +88,7 @@ public abstract class AbstractMdRecordToOafMapper { DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); final Document doc = DocumentHelper - .parseText( - xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3)); + .parseText(xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3)); final String type = doc.valueOf("//dr:CobjCategory/@type"); final KeyValue collectedFrom = getProvenanceDatasource( @@ -103,7 +115,7 @@ public abstract class AbstractMdRecordToOafMapper { } } - private KeyValue getProvenanceDatasource(Document doc, String xpathId, String xpathName) { + private KeyValue getProvenanceDatasource(final Document doc, final String xpathId, final String xpathName) { final String dsId = doc.valueOf(xpathId); final String dsName = doc.valueOf(xpathName); @@ -111,9 +123,7 @@ public abstract class AbstractMdRecordToOafMapper { return null; } - return keyValue( - createOpenaireId(10, dsId, true), - dsName); + return keyValue(createOpenaireId(10, dsId, true), dsName); } protected List createOafs( @@ -211,8 +221,14 @@ public abstract class AbstractMdRecordToOafMapper { return res; } - protected Relation getRelation(String source, String target, String relType, String subRelType, String relClass, - KeyValue collectedFrom, DataInfo info, long lastUpdateTimestamp) { + protected Relation getRelation(final String source, + final String target, + final String relType, + final String subRelType, + final String relClass, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp) { final Relation rel = new Relation(); rel.setRelType(relType); rel.setSubRelType(subRelType); @@ -289,7 +305,10 @@ public abstract class AbstractMdRecordToOafMapper { protected abstract Qualifier prepareResourceType(Document doc, DataInfo info); protected abstract List prepareInstances( - Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby); + Document doc, + DataInfo info, + KeyValue collectedfrom, + KeyValue hostedby); protected abstract List> prepareSources(Document doc, DataInfo info); @@ -314,13 +333,16 @@ public abstract class AbstractMdRecordToOafMapper { protected abstract List prepareAuthors(Document doc, DataInfo info); protected abstract List> prepareOtherResearchProductTools( - Document doc, DataInfo info); + Document doc, + DataInfo info); protected abstract List> prepareOtherResearchProductContactGroups( - Document doc, DataInfo info); + Document doc, + DataInfo info); protected abstract List> prepareOtherResearchProductContactPersons( - Document doc, DataInfo info); + Document doc, + DataInfo info); protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info); @@ -329,7 +351,8 @@ public abstract class AbstractMdRecordToOafMapper { protected abstract List prepareSoftwareLicenses(Document doc, DataInfo info); protected abstract List> prepareSoftwareDocumentationUrls( - Document doc, DataInfo info); + Document doc, + DataInfo info); protected abstract List prepareDatasetGeoLocations(Document doc, DataInfo info); @@ -358,26 +381,17 @@ public abstract class AbstractMdRecordToOafMapper { final String vol = n.valueOf("@vol"); final String edition = n.valueOf("@edition"); if (StringUtils.isNotBlank(name)) { - return journal( - name, - issnPrinted, - issnOnline, - issnLinking, - ep, - iss, - sp, - vol, - edition, - null, - null, - info); + return journal(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, null, null, info); } } return null; } protected Qualifier prepareQualifier( - final Node node, final String xpath, final String schemeId, final String schemeName) { + final Node node, + final String xpath, + final String schemeId, + final String schemeName) { final String classId = node.valueOf(xpath); final String className = code2name.get(classId); return qualifier(classId, className, schemeId, schemeName); @@ -401,7 +415,10 @@ public abstract class AbstractMdRecordToOafMapper { } protected List prepareListStructProps( - final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) { + final Node node, + final String xpath, + final Qualifier qualifier, + final DataInfo info) { final List res = new ArrayList<>(); for (final Object o : node.selectNodes(xpath)) { final Node n = (Node) o; @@ -411,19 +428,17 @@ public abstract class AbstractMdRecordToOafMapper { } protected List prepareListStructProps( - final Node node, final String xpath, final DataInfo info) { + final Node node, + final String xpath, + final DataInfo info) { final List res = new ArrayList<>(); for (final Object o : node.selectNodes(xpath)) { final Node n = (Node) o; res .add( structuredProperty( - n.getText(), - n.valueOf("@classid"), - n.valueOf("@classname"), - n.valueOf("@schemeid"), - n.valueOf("@schemename"), - info)); + n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), + n.valueOf("@schemename"), info)); } return res; } @@ -449,8 +464,7 @@ public abstract class AbstractMdRecordToOafMapper { final Node n = doc.selectSingleNode("//oaf:datainfo"); if (n == null) { - return dataInfo( - false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9"); + return dataInfo(false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9"); } final String paClassId = n.valueOf("./oaf:provenanceaction/@classid"); @@ -464,12 +478,8 @@ public abstract class AbstractMdRecordToOafMapper { final String trust = n.valueOf("./oaf:trust"); return dataInfo( - deletedbyinference, - inferenceprovenance, - inferred, - false, - qualifier(paClassId, paClassName, paSchemeId, paSchemeName), - trust); + deletedbyinference, inferenceprovenance, inferred, false, + qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust); } protected Field prepareField(final Node node, final String xpath, final DataInfo info) { @@ -477,7 +487,9 @@ public abstract class AbstractMdRecordToOafMapper { } protected List> prepareListFields( - final Node node, final String xpath, final DataInfo info) { + final Node node, + final String xpath, + final DataInfo info) { return listFields(info, prepareListString(node, xpath)); } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index ed09016da..af9fe7197 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -1,10 +1,19 @@ package eu.dnetlib.dhp.oa.graph.raw; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.*; -import static eu.dnetlib.dhp.schema.common.ModelConstants.*; +import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId; +import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field; +import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty; +import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES; +import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_LANGUAGES; +import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PUBLICATION_RESOURCE; +import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_RELATED_TO; +import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DATASET; +import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT; -import java.util.*; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; @@ -15,8 +24,15 @@ import org.dom4j.Node; import com.google.common.collect.Lists; import eu.dnetlib.dhp.oa.graph.raw.common.PacePerson; -import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.Author; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.Field; +import eu.dnetlib.dhp.schema.oaf.GeoLocation; +import eu.dnetlib.dhp.schema.oaf.Instance; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; public class OafToOafMapper extends AbstractMdRecordToOafMapper { @@ -39,14 +55,25 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { author.setSurname(p.getNormalisedSurname()); } - final String pid = e.attributeValue("nameIdentifier"); - final String pidType = e.attributeValue("nameIdentifierScheme"); + final String pid = e.valueOf("./@nameIdentifier"); + final String type = e + .valueOf("./@nameIdentifierScheme") + .trim() + .toUpperCase() + .replaceAll(" ", "") + .replaceAll("_", ""); author.setPid(new ArrayList<>()); - if (StringUtils.isNotBlank(pid) && StringUtils.isNotBlank(pidType)) { - author - .getPid() - .add(structuredProperty(pid, qualifier(pidType, pidType, DNET_PID_TYPES, DNET_PID_TYPES), info)); + + if (StringUtils.isNotBlank(pid)) { + if (type.startsWith("ORCID")) { + final String cleanedId = pid + .replaceAll("http://orcid.org/", "") + .replaceAll("https://orcid.org/", ""); + author.getPid().add(structuredProperty(cleanedId, ORCID_PID_TYPE, info)); + } else if (type.startsWith("MAGID")) { + author.getPid().add(structuredProperty(pid, MAG_PID_TYPE, info)); + } } res.add(author); @@ -104,28 +131,21 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { final Instance instance = new Instance(); instance .setInstancetype( - prepareQualifier( - doc, - "//dr:CobjCategory", - DNET_PUBLICATION_RESOURCE, - DNET_PUBLICATION_RESOURCE)); + prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); instance.setCollectedfrom(collectedfrom); instance.setHostedby(hostedby); instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); instance - .setAccessright( - prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES, DNET_ACCESS_MODES)); + .setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES, DNET_ACCESS_MODES)); instance.setLicense(field(doc.valueOf("//oaf:license"), info)); instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info)); instance - .setProcessingchargeamount( - field(doc.valueOf("//oaf:processingchargeamount"), info)); + .setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info)); instance - .setProcessingchargecurrency( - field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); + .setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); - List nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier")); + final List nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier")); instance .setUrl( nodes @@ -158,19 +178,22 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { @Override protected Field prepareSoftwareCodeRepositoryUrl( - final Document doc, final DataInfo info) { + final Document doc, + final DataInfo info) { return null; // NOT PRESENT IN OAF } @Override protected List prepareSoftwareLicenses( - final Document doc, final DataInfo info) { + final Document doc, + final DataInfo info) { return new ArrayList<>(); // NOT PRESENT IN OAF } @Override protected List> prepareSoftwareDocumentationUrls( - final Document doc, final DataInfo info) { + final Document doc, + final DataInfo info) { return new ArrayList<>(); // NOT PRESENT IN OAF } @@ -182,13 +205,15 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { @Override protected Field prepareDatasetMetadataVersionNumber( - final Document doc, final DataInfo info) { + final Document doc, + final DataInfo info) { return null; // NOT PRESENT IN OAF } @Override protected Field prepareDatasetLastMetadataUpdate( - final Document doc, final DataInfo info) { + final Document doc, + final DataInfo info) { return null; // NOT PRESENT IN OAF } @@ -216,19 +241,22 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { @Override protected List> prepareOtherResearchProductTools( - final Document doc, final DataInfo info) { + final Document doc, + final DataInfo info) { return new ArrayList<>(); // NOT PRESENT IN OAF } @Override protected List> prepareOtherResearchProductContactGroups( - final Document doc, final DataInfo info) { + final Document doc, + final DataInfo info) { return new ArrayList<>(); // NOT PRESENT IN OAF } @Override protected List> prepareOtherResearchProductContactPersons( - final Document doc, final DataInfo info) { + final Document doc, + final DataInfo info) { return new ArrayList<>(); // NOT PRESENT IN OAF } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 30b980c42..9c74c4a93 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -4,16 +4,31 @@ package eu.dnetlib.dhp.oa.graph.raw; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty; -import static eu.dnetlib.dhp.schema.common.ModelConstants.*; +import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES; +import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_DATE; +import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_RESOURCE; +import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_LANGUAGES; +import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PUBLICATION_RESOURCE; +import static eu.dnetlib.dhp.schema.common.ModelConstants.HAS_PARTS; +import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PART_OF; +import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_SUPPLEMENTED_BY; +import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_SUPPLEMENT_TO; +import static eu.dnetlib.dhp.schema.common.ModelConstants.PART; +import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT; +import static eu.dnetlib.dhp.schema.common.ModelConstants.SUPPLEMENT; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; import org.apache.commons.lang3.StringUtils; import org.dom4j.Document; import org.dom4j.Node; import eu.dnetlib.dhp.oa.graph.raw.common.PacePerson; -import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Field; @@ -22,7 +37,6 @@ import eu.dnetlib.dhp.schema.oaf.Instance; import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.schema.oaf.Qualifier; -import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; public class OdfToOafMapper extends AbstractMdRecordToOafMapper { @@ -48,7 +62,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { final String fullname = n.valueOf("./datacite:creatorName"); author.setFullname(fullname); - PacePerson pp = new PacePerson(fullname, false); + final PacePerson pp = new PacePerson(fullname, false); final String name = n.valueOf("./datacite:givenName"); if (StringUtils.isBlank(name) & pp.isAccurate()) { author.setName(pp.getNormalisedFirstName()); @@ -63,6 +77,10 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { author.setSurname(surname); } + if (StringUtils.isBlank(author.getFullname())) { + author.setFullname(String.format("%s, %s", author.getSurname(), author.getName())); + } + author.setAffiliation(prepareListFields(n, "./datacite:affiliation", info)); author.setPid(preparePids(n, info)); author.setRank(pos++); @@ -74,13 +92,21 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { private List preparePids(final Node n, final DataInfo info) { final List res = new ArrayList<>(); for (final Object o : n.selectNodes("./datacite:nameIdentifier")) { - res - .add( - structuredProperty( - ((Node) o).getText(), - prepareQualifier( - (Node) o, "./@nameIdentifierScheme", DNET_PID_TYPES, DNET_PID_TYPES), - info)); + + final String id = ((Node) o).getText(); + final String type = ((Node) o) + .valueOf("./@nameIdentifierScheme") + .trim() + .toUpperCase() + .replaceAll(" ", "") + .replaceAll("_", ""); + + if (type.startsWith("ORCID")) { + final String cleanedId = id.replaceAll("http://orcid.org/", "").replaceAll("https://orcid.org/", ""); + res.add(structuredProperty(cleanedId, ORCID_PID_TYPE, info)); + } else if (type.startsWith("MAGID")) { + res.add(structuredProperty(id, MAG_PID_TYPE, info)); + } } return res; } @@ -95,21 +121,18 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { final Instance instance = new Instance(); instance .setInstancetype( - prepareQualifier( - doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); + prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); instance.setCollectedfrom(collectedfrom); instance.setHostedby(hostedby); instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); instance - .setAccessright( - prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES, DNET_ACCESS_MODES)); + .setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES, DNET_ACCESS_MODES)); instance.setLicense(field(doc.valueOf("//oaf:license"), info)); instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info)); instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info)); instance - .setProcessingchargecurrency( - field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); + .setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); final Set url = new HashSet<>(); for (final Object o : doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='URL']")) { @@ -149,11 +172,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { res .add( structuredProperty( - ((Node) o).getText(), - "UNKNOWN", - "UNKNOWN", - DNET_DATA_CITE_DATE, - DNET_DATA_CITE_DATE, + ((Node) o).getText(), "UNKNOWN", "UNKNOWN", DNET_DATA_CITE_DATE, DNET_DATA_CITE_DATE, info)); } } @@ -197,53 +216,52 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { @Override protected List> prepareOtherResearchProductTools( - final Document doc, final DataInfo info) { + final Document doc, + final DataInfo info) { return new ArrayList<>(); // Not present in ODF ??? } @Override protected List> prepareOtherResearchProductContactGroups( - final Document doc, final DataInfo info) { + final Document doc, + final DataInfo info) { return prepareListFields( - doc, - "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", - info); + doc, "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", info); } @Override protected List> prepareOtherResearchProductContactPersons( - final Document doc, final DataInfo info) { + final Document doc, + final DataInfo info) { return prepareListFields( - doc, - "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", - info); + doc, "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", info); } @Override protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) { - return prepareQualifier( - doc, "//datacite:format", "dnet:programming_languages", "dnet:programming_languages"); + return prepareQualifier(doc, "//datacite:format", "dnet:programming_languages", "dnet:programming_languages"); } @Override protected Field prepareSoftwareCodeRepositoryUrl( - final Document doc, final DataInfo info) { + final Document doc, + final DataInfo info) { return null; // Not present in ODF ??? } @Override protected List prepareSoftwareLicenses( - final Document doc, final DataInfo info) { + final Document doc, + final DataInfo info) { return new ArrayList<>(); // Not present in ODF ??? } @Override protected List> prepareSoftwareDocumentationUrls( - final Document doc, final DataInfo info) { + final Document doc, + final DataInfo info) { return prepareListFields( - doc, - "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", - info); + doc, "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info); } // DATASETS @@ -264,13 +282,15 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { @Override protected Field prepareDatasetMetadataVersionNumber( - final Document doc, final DataInfo info) { + final Document doc, + final DataInfo info) { return null; // Not present in ODF ??? } @Override protected Field prepareDatasetLastMetadataUpdate( - final Document doc, final DataInfo info) { + final Document doc, + final DataInfo info) { return prepareField(doc, "//datacite:date[@dateType='Updated']", info); } @@ -346,9 +366,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { @Override protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { return prepareQualifier( - doc, - "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", - DNET_DATA_CITE_RESOURCE, + doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", DNET_DATA_CITE_RESOURCE, DNET_DATA_CITE_RESOURCE); } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOrganizations.sql b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOrganizations.sql index aeb04aff9..d13bd4342 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOrganizations.sql +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOrganizations.sql @@ -22,12 +22,13 @@ SELECT '' AS inferenceprovenance, d.id AS collectedfromid, d.officialname AS collectedfromname, - o.country || '@@@' || o.country || '@@@dnet:countries@@@dnet:countries' AS country, + o.country || '@@@' || COALESCE(cntr.name,o.country) || '@@@dnet:countries@@@dnet:countries' AS country, 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction, - ARRAY[]::text[] AS pid + FROM dsm_organizations o LEFT OUTER JOIN dsm_datasources d ON (d.id = o.collectedfrom) + LEFT OUTER JOIN class cntr ON (cntr.code = o.country) diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 5a006e351..631e7235e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -21,7 +21,14 @@ import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.Author; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Field; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Software; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; @ExtendWith(MockitoExtension.class) public class MappersTest { @@ -54,13 +61,13 @@ public class MappersTest { assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); assertTrue(p.getAuthor().size() > 0); - Optional author = p + final Optional author = p .getAuthor() .stream() .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) .findFirst(); assertTrue(author.isPresent()); - StructuredProperty pid = author + final StructuredProperty pid = author .get() .getPid() .stream() @@ -68,7 +75,7 @@ public class MappersTest { .get(); assertEquals("0000-0001-6651-1178", pid.getValue()); assertEquals("ORCID", pid.getQualifier().getClassid()); - assertEquals("ORCID", pid.getQualifier().getClassname()); + assertEquals("Open Researcher and Contributor ID", pid.getQualifier().getClassname()); assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemeid()); assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemename()); assertEquals("Votsi,Nefta", author.get().getFullname()); @@ -121,13 +128,13 @@ public class MappersTest { assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue())); assertTrue(d.getAuthor().size() > 0); - Optional author = d + final Optional author = d .getAuthor() .stream() .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) .findFirst(); assertTrue(author.isPresent()); - StructuredProperty pid = author + final StructuredProperty pid = author .get() .getPid() .stream() @@ -135,7 +142,7 @@ public class MappersTest { .get(); assertEquals("0000-0001-9074-1619", pid.getValue()); assertEquals("ORCID", pid.getQualifier().getClassid()); - assertEquals("ORCID", pid.getQualifier().getClassname()); + assertEquals("Open Researcher and Contributor ID", pid.getQualifier().getClassname()); assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemeid()); assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemename()); assertEquals("Baracchini, Theo", author.get().getFullname()); @@ -143,13 +150,13 @@ public class MappersTest { assertEquals("Theo", author.get().getName()); assertEquals(1, author.get().getAffiliation().size()); - Optional> opAff = author + final Optional> opAff = author .get() .getAffiliation() .stream() .findFirst(); assertTrue(opAff.isPresent()); - Field affiliation = opAff.get(); + final Field affiliation = opAff.get(); assertEquals("ISTI-CNR", affiliation.getValue()); assertTrue(d.getSubject().size() > 0); diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java index dbdc54fc0..72d68a389 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java @@ -3,7 +3,9 @@ package eu.dnetlib.dhp.oa.provision; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import java.util.HashSet; import java.util.Optional; +import java.util.Set; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; @@ -19,8 +21,10 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.base.Splitter; import com.google.common.collect.Iterables; import com.google.common.collect.Iterators; +import com.google.common.collect.Sets; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; @@ -58,6 +62,8 @@ public class PrepareRelationsJob { public static final int MAX_RELS = 100; + public static final int DEFAULT_NUM_PARTITIONS = 3000; + public static void main(String[] args) throws Exception { String jsonConfiguration = IOUtils .toString( @@ -79,6 +85,24 @@ public class PrepareRelationsJob { String outputPath = parser.get("outputPath"); log.info("outputPath: {}", outputPath); + int relPartitions = Optional + .ofNullable(parser.get("relPartitions")) + .map(Integer::valueOf) + .orElse(DEFAULT_NUM_PARTITIONS); + log.info("relPartitions: {}", relPartitions); + + Set relationFilter = Optional + .ofNullable(parser.get("relationFilter")) + .map(s -> Sets.newHashSet(Splitter.on(",").split(s))) + .orElse(new HashSet<>()); + log.info("relationFilter: {}", relationFilter); + + int maxRelations = Optional + .ofNullable(parser.get("maxRelations")) + .map(Integer::valueOf) + .orElse(MAX_RELS); + log.info("maxRelations: {}", maxRelations); + SparkConf conf = new SparkConf(); runWithSparkSession( @@ -86,25 +110,74 @@ public class PrepareRelationsJob { isSparkSessionManaged, spark -> { removeOutputDir(spark, outputPath); - prepareRelationsFromPaths(spark, inputRelationsPath, outputPath); + prepareRelationsRDD( + spark, inputRelationsPath, outputPath, relationFilter, relPartitions, maxRelations); }); } - private static void prepareRelationsFromPaths( - SparkSession spark, String inputRelationsPath, String outputPath) { + /** + * Dataset based implementation that prepares the graph relations by limiting the number of outgoing links and + * filtering the relation types according to the given criteria. + * + * @param spark the spark session + * @param inputRelationsPath source path for the graph relations + * @param outputPath output path for the processed relations + * @param relationFilter set of relation filters applied to the `relClass` field + * @param maxRelations maximum number of allowed outgoing edges + */ + private static void prepareRelations( + SparkSession spark, String inputRelationsPath, String outputPath, Set relationFilter, + int maxRelations) { readPathRelation(spark, inputRelationsPath) .filter("dataInfo.deletedbyinference == false") + .filter((FilterFunction) rel -> !relationFilter.contains(rel.getRelClass())) .groupByKey( (MapFunction) value -> value.getSource(), Encoders.STRING()) .flatMapGroups( (FlatMapGroupsFunction) (key, values) -> Iterators - .limit(values, MAX_RELS), + .limit(values, maxRelations), Encoders.bean(SortableRelation.class)) .write() .mode(SaveMode.Overwrite) .parquet(outputPath); } + /** + * RDD based implementation that prepares the graph relations by limiting the number of outgoing links and filtering + * the relation types according to the given criteria. Moreover, outgoing links kept within the given limit are + * prioritized according to the weights indicated in eu.dnetlib.dhp.oa.provision.model.SortableRelation. + * + * @param spark the spark session + * @param inputRelationsPath source path for the graph relations + * @param outputPath output path for the processed relations + * @param relationFilter set of relation filters applied to the `relClass` field + * @param maxRelations maximum number of allowed outgoing edges + */ + // TODO work in progress + private static void prepareRelationsRDD( + SparkSession spark, String inputRelationsPath, String outputPath, Set relationFilter, int relPartitions, + int maxRelations) { + JavaRDD rels = readPathRelationRDD(spark, inputRelationsPath).repartition(relPartitions); + RelationPartitioner partitioner = new RelationPartitioner(rels.getNumPartitions()); + + // only consider those that are not virtually deleted + RDD d = rels + .filter(rel -> !rel.getDataInfo().getDeletedbyinference()) + .filter(rel -> !relationFilter.contains(rel.getRelClass())) + .mapToPair( + (PairFunction) rel -> new Tuple2<>(rel, rel)) + .groupByKey(partitioner) + .map(group -> Iterables.limit(group._2(), maxRelations)) + .flatMap(group -> group.iterator()) + .rdd(); + + spark + .createDataset(d, Encoders.bean(SortableRelation.class)) + .write() + .mode(SaveMode.Overwrite) + .parquet(outputPath); + } + /** * Reads a Dataset of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text * file, @@ -123,31 +196,6 @@ public class PrepareRelationsJob { Encoders.bean(SortableRelation.class)); } - // TODO work in progress - private static void prepareRelationsRDDFromPaths( - SparkSession spark, String inputRelationsPath, String outputPath, int numPartitions) { - JavaRDD rels = readPathRelationRDD(spark, inputRelationsPath).repartition(numPartitions); - - RDD d = rels - .filter(rel -> !rel.getDataInfo().getDeletedbyinference()) // only - // consider - // those - // that are not virtually - // deleted - .mapToPair( - (PairFunction) rel -> new Tuple2<>(rel, rel)) - .groupByKey(new RelationPartitioner(rels.getNumPartitions())) - .map(p -> Iterables.limit(p._2(), MAX_RELS)) - .flatMap(p -> p.iterator()) - .rdd(); - - spark - .createDataset(d, Encoders.bean(SortableRelation.class)) - .write() - .mode(SaveMode.Overwrite) - .parquet(outputPath); - } - private static JavaRDD readPathRelationRDD( SparkSession spark, final String inputPath) { JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java index 7c866001b..b6571b9bf 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java @@ -16,10 +16,10 @@ public class SortableRelation extends Relation implements Comparable, static { weights.put("outcome", 0); weights.put("supplement", 1); - weights.put("publicationDataset", 2); + weights.put("affiliation", 2); weights.put("relationship", 3); - weights.put("similarity", 4); - weights.put("affiliation", 5); + weights.put("publicationDataset", 4); + weights.put("similarity", 5); weights.put("provision", 6); weights.put("participation", 7); diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java index 6cb025b4f..21b526ab1 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java @@ -5,6 +5,7 @@ import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.removePrefix; import static eu.dnetlib.dhp.oa.provision.utils.XmlSerializationUtils.escapeXml; import java.io.IOException; +import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.stream.Collectors; @@ -95,7 +96,7 @@ public class TemplateFactory { .add("metadata", instancemetadata) .add( "webresources", - webresources + (webresources != null ? webresources : new ArrayList()) .stream() .filter(StringUtils::isNotBlank) .map(w -> getWebResource(w)) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java index ce1c71312..6f042b45c 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java @@ -174,6 +174,7 @@ public class XmlRecordFactory implements Serializable { entity .getCollectedfrom() .stream() + .filter(XmlRecordFactory::kvNotBlank) .map(kv -> XmlSerializationUtils.mapKeyValue("collectedfrom", kv)) .collect(Collectors.toList())); } @@ -183,6 +184,7 @@ public class XmlRecordFactory implements Serializable { entity .getOriginalId() .stream() + .filter(Objects::nonNull) .map(s -> XmlSerializationUtils.asXmlElement("originalId", s)) .collect(Collectors.toList())); } @@ -192,6 +194,7 @@ public class XmlRecordFactory implements Serializable { entity .getPid() .stream() + .filter(Objects::nonNull) .map(p -> XmlSerializationUtils.mapStructuredProperty("pid", p)) .collect(Collectors.toList())); } @@ -213,6 +216,7 @@ public class XmlRecordFactory implements Serializable { r .getTitle() .stream() + .filter(Objects::nonNull) .map(t -> XmlSerializationUtils.mapStructuredProperty("title", t)) .collect(Collectors.toList())); } @@ -225,6 +229,7 @@ public class XmlRecordFactory implements Serializable { r .getAuthor() .stream() + .filter(Objects::nonNull) .map( a -> { final StringBuilder sb = new StringBuilder(" isNotBlank(sp.getQualifier().getClassid()) && isNotBlank(sp.getValue())) + .collect( + Collectors + .toMap( + p -> getAuthorPidType(p.getQualifier().getClassid()), + p -> p, + (p1, p2) -> p1)) + .values() .forEach( sp -> { - String pidType = XmlSerializationUtils - .escapeXml( - sp.getQualifier().getClassid()) - .replaceAll("\\W", ""); + String pidType = getAuthorPidType(sp.getQualifier().getClassid()); String pidValue = XmlSerializationUtils.escapeXml(sp.getValue()); - // ugly hack: some records - // provide swapped pidtype and - // pidvalue + // ugly hack: some records provide swapped pidtype and pidvalue if (authorPidTypes.contains(pidValue.toLowerCase().trim())) { sb.append(String.format(" %s=\"%s\"", pidValue, pidType)); } else { - pidType = pidType.replaceAll("\\W", "").replaceAll("\\d", ""); if (isNotBlank(pidType)) { sb .append( @@ -285,6 +292,7 @@ public class XmlRecordFactory implements Serializable { r .getContributor() .stream() + .filter(Objects::nonNull) .map(c -> XmlSerializationUtils.asXmlElement("contributor", c.getValue())) .collect(Collectors.toList())); } @@ -294,6 +302,7 @@ public class XmlRecordFactory implements Serializable { r .getCountry() .stream() + .filter(Objects::nonNull) .map(c -> XmlSerializationUtils.mapQualifier("country", c)) .collect(Collectors.toList())); } @@ -303,6 +312,7 @@ public class XmlRecordFactory implements Serializable { r .getCoverage() .stream() + .filter(Objects::nonNull) .map(c -> XmlSerializationUtils.asXmlElement("coverage", c.getValue())) .collect(Collectors.toList())); } @@ -319,6 +329,7 @@ public class XmlRecordFactory implements Serializable { r .getDescription() .stream() + .filter(Objects::nonNull) .map(c -> XmlSerializationUtils.asXmlElement("description", c.getValue())) .collect(Collectors.toList())); } @@ -333,6 +344,7 @@ public class XmlRecordFactory implements Serializable { r .getSubject() .stream() + .filter(Objects::nonNull) .map(s -> XmlSerializationUtils.mapStructuredProperty("subject", s)) .collect(Collectors.toList())); } @@ -345,6 +357,7 @@ public class XmlRecordFactory implements Serializable { r .getRelevantdate() .stream() + .filter(Objects::nonNull) .map(s -> XmlSerializationUtils.mapStructuredProperty("relevantdate", s)) .collect(Collectors.toList())); } @@ -357,6 +370,7 @@ public class XmlRecordFactory implements Serializable { r .getSource() .stream() + .filter(Objects::nonNull) .map(c -> XmlSerializationUtils.asXmlElement("source", c.getValue())) .collect(Collectors.toList())); } @@ -366,6 +380,7 @@ public class XmlRecordFactory implements Serializable { r .getFormat() .stream() + .filter(Objects::nonNull) .map(c -> XmlSerializationUtils.asXmlElement("format", c.getValue())) .collect(Collectors.toList())); } @@ -429,6 +444,7 @@ public class XmlRecordFactory implements Serializable { orp .getContactperson() .stream() + .filter(Objects::nonNull) .map(c -> XmlSerializationUtils.asXmlElement("contactperson", c.getValue())) .collect(Collectors.toList())); } @@ -439,6 +455,7 @@ public class XmlRecordFactory implements Serializable { orp .getContactgroup() .stream() + .filter(Objects::nonNull) .map(c -> XmlSerializationUtils.asXmlElement("contactgroup", c.getValue())) .collect(Collectors.toList())); } @@ -448,6 +465,7 @@ public class XmlRecordFactory implements Serializable { orp .getTool() .stream() + .filter(Objects::nonNull) .map(c -> XmlSerializationUtils.asXmlElement("tool", c.getValue())) .collect(Collectors.toList())); } @@ -461,6 +479,7 @@ public class XmlRecordFactory implements Serializable { s .getDocumentationUrl() .stream() + .filter(Objects::nonNull) .map(c -> XmlSerializationUtils.asXmlElement("documentationUrl", c.getValue())) .collect(Collectors.toList())); } @@ -470,6 +489,7 @@ public class XmlRecordFactory implements Serializable { s .getLicense() .stream() + .filter(Objects::nonNull) .map(l -> XmlSerializationUtils.mapStructuredProperty("license", l)) .collect(Collectors.toList())); } @@ -576,6 +596,7 @@ public class XmlRecordFactory implements Serializable { ds .getOdlanguages() .stream() + .filter(Objects::nonNull) .map(c -> XmlSerializationUtils.asXmlElement("odlanguages", c.getValue())) .collect(Collectors.toList())); } @@ -585,6 +606,7 @@ public class XmlRecordFactory implements Serializable { ds .getOdcontenttypes() .stream() + .filter(Objects::nonNull) .map(c -> XmlSerializationUtils.asXmlElement("odcontenttypes", c.getValue())) .collect(Collectors.toList())); } @@ -697,6 +719,7 @@ public class XmlRecordFactory implements Serializable { ds .getPolicies() .stream() + .filter(XmlRecordFactory::kvNotBlank) .map(kv -> XmlSerializationUtils.mapKeyValue("policies", kv)) .collect(Collectors.toList())); } @@ -709,6 +732,7 @@ public class XmlRecordFactory implements Serializable { ds .getSubjects() .stream() + .filter(Objects::nonNull) .map(sp -> XmlSerializationUtils.mapStructuredProperty("subjects", sp)) .collect(Collectors.toList())); } @@ -735,6 +759,7 @@ public class XmlRecordFactory implements Serializable { o .getAlternativeNames() .stream() + .filter(Objects::nonNull) .map(c -> XmlSerializationUtils.asXmlElement("alternativeNames", c.getValue())) .collect(Collectors.toList())); } @@ -862,6 +887,7 @@ public class XmlRecordFactory implements Serializable { p .getSubjects() .stream() + .filter(Objects::nonNull) .map(sp -> XmlSerializationUtils.mapStructuredProperty("subject", sp)) .collect(Collectors.toList())); } @@ -912,7 +938,12 @@ public class XmlRecordFactory implements Serializable { if (p.getFundingtree() != null) { metadata .addAll( - p.getFundingtree().stream().map(ft -> ft.getValue()).collect(Collectors.toList())); + p + .getFundingtree() + .stream() + .filter(Objects::nonNull) + .map(ft -> ft.getValue()) + .collect(Collectors.toList())); } break; @@ -923,6 +954,17 @@ public class XmlRecordFactory implements Serializable { return metadata; } + private String getAuthorPidType(String s) { + return XmlSerializationUtils + .escapeXml(s) + .replaceAll("\\W", "") + .replaceAll("\\d", ""); + } + + private static boolean kvNotBlank(KeyValue kv) { + return kv != null && StringUtils.isNotBlank(kv.getKey()) && StringUtils.isNotBlank(kv.getValue()); + } + private void mapDatasourceType(List metadata, final Qualifier dsType) { metadata.add(XmlSerializationUtils.mapQualifier("datasourcetype", dsType)); @@ -960,7 +1002,7 @@ public class XmlRecordFactory implements Serializable { .add( XmlSerializationUtils.asXmlElement("coderepositoryurl", re.getCodeRepositoryUrl())); } - if (re.getResulttype() != null & re.getResulttype().isBlank()) { + if (re.getResulttype() != null && re.getResulttype().isBlank()) { metadata.add(XmlSerializationUtils.mapQualifier("resulttype", re.getResulttype())); } if (re.getCollectedfrom() != null) { @@ -969,6 +1011,7 @@ public class XmlRecordFactory implements Serializable { re .getCollectedfrom() .stream() + .filter(XmlRecordFactory::kvNotBlank) .map(kv -> XmlSerializationUtils.mapKeyValue("collectedfrom", kv)) .collect(Collectors.toList())); } @@ -986,10 +1029,10 @@ public class XmlRecordFactory implements Serializable { if (isNotBlank(re.getOfficialname())) { metadata.add(XmlSerializationUtils.asXmlElement("officialname", re.getOfficialname())); } - if (re.getDatasourcetype() != null & !re.getDatasourcetype().isBlank()) { + if (re.getDatasourcetype() != null && !re.getDatasourcetype().isBlank()) { mapDatasourceType(metadata, re.getDatasourcetype()); } - if (re.getOpenairecompatibility() != null & !re.getOpenairecompatibility().isBlank()) { + if (re.getOpenairecompatibility() != null && !re.getOpenairecompatibility().isBlank()) { metadata .add( XmlSerializationUtils @@ -1006,7 +1049,7 @@ public class XmlRecordFactory implements Serializable { .add( XmlSerializationUtils.asXmlElement("legalshortname", re.getLegalshortname())); } - if (re.getCountry() != null & !re.getCountry().isBlank()) { + if (re.getCountry() != null && !re.getCountry().isBlank()) { metadata.add(XmlSerializationUtils.mapQualifier("country", re.getCountry())); } break; @@ -1020,10 +1063,10 @@ public class XmlRecordFactory implements Serializable { if (isNotBlank(re.getAcronym())) { metadata.add(XmlSerializationUtils.asXmlElement("acronym", re.getAcronym())); } - if (re.getContracttype() != null & !re.getContracttype().isBlank()) { + if (re.getContracttype() != null && !re.getContracttype().isBlank()) { metadata.add(XmlSerializationUtils.mapQualifier("contracttype", re.getContracttype())); } - if (re.getFundingtree() != null & contexts != null) { + if (re.getFundingtree() != null && contexts != null) { metadata .addAll( re @@ -1091,12 +1134,12 @@ public class XmlRecordFactory implements Serializable { .add( XmlSerializationUtils.mapQualifier("accessright", instance.getAccessright())); } - if (instance.getCollectedfrom() != null) { + if (instance.getCollectedfrom() != null && kvNotBlank(instance.getCollectedfrom())) { fields .add( XmlSerializationUtils.mapKeyValue("collectedfrom", instance.getCollectedfrom())); } - if (instance.getHostedby() != null) { + if (instance.getHostedby() != null && kvNotBlank(instance.getHostedby())) { fields.add(XmlSerializationUtils.mapKeyValue("hostedby", instance.getHostedby())); } if (instance.getDateofacceptance() != null diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json index bfb248d01..71b2becc4 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json @@ -21,6 +21,18 @@ "paramName": "rp", "paramLongName": "relPartitions", "paramDescription": "number or partitions for the relations Dataset", - "paramRequired": true + "paramRequired": false + }, + { + "paramName": "rf", + "paramLongName": "relationFilter", + "paramDescription": "filter applied reading relations (by relClass)", + "paramRequired": false + }, + { + "paramName": "mr", + "paramLongName": "maxRelations", + "paramDescription": "maximum number of relations allowed for a each entity", + "paramRequired": false } ] diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml index 298ac7589..6983ecf53 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml @@ -9,6 +9,30 @@ isLookupUrl URL for the isLookup service + + relPartitions + number or partitions for the relations Dataset + + + relationFilter + filter applied reading relations (by relClass) + + + maxRelations + maximum number of relations allowed for a each entity + + + otherDsTypeId + mapping used to populate datasourceTypeUi field + + + format + metadata format name (DMF|TMF) + + + batchSize + number of records to be included in each indexing request + sparkDriverMemoryForJoining @@ -56,6 +80,10 @@ spark2EventLogDir spark 2.* event log dir location + + sparkNetworkTimeout + configures spark.network.timeout + @@ -69,12 +97,16 @@ - + - + - ${wf:conf('reuseRecords') eq false} - ${wf:conf('reuseRecords') eq true} + ${wf:conf('resumeFrom') eq 'prepare_relations'} + ${wf:conf('resumeFrom') eq 'fork_join_related_entities'} + ${wf:conf('resumeFrom') eq 'join_all_entities'} + ${wf:conf('resumeFrom') eq 'adjancency_lists'} + ${wf:conf('resumeFrom') eq 'convert_to_xml'} + ${wf:conf('resumeFrom') eq 'to_solr_index'} @@ -309,7 +341,6 @@ - yarn @@ -419,4 +450,5 @@ + \ No newline at end of file diff --git a/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/provision.xml b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/provision.xml index 7c918a0d7..487afee4f 100644 --- a/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/provision.xml +++ b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/provision.xml @@ -521,6 +521,8 @@ { 'oozie.wf.application.path' : '/lib/dnet/oa/enrichment/country/oozie_app', + 'sparkExecutorCores' : '3', + 'sparkExecutorMemory' : '10G', 'workingDir' : '/tmp/beta_provision/working_dir/country', 'allowedtypes' : 'pubsrepository::institutional', 'whitelist' : '10|opendoar____::300891a62162b960cf02ce3827bb363c',