diff --git a/dhp-workflows/dhp-broker-events/pom.xml b/dhp-workflows/dhp-broker-events/pom.xml index 59ff05ed3..cc323d109 100644 --- a/dhp-workflows/dhp-broker-events/pom.xml +++ b/dhp-workflows/dhp-broker-events/pom.xml @@ -1,66 +1,72 @@ - - - dhp-workflows - eu.dnetlib.dhp - 1.2.2-SNAPSHOT - - 4.0.0 + + + dhp-workflows + eu.dnetlib.dhp + 1.2.2-SNAPSHOT + + 4.0.0 - dhp-broker-events + dhp-broker-events - + - - commons-io - commons-io - + + commons-io + commons-io + - - org.apache.spark - spark-core_2.11 - - - org.apache.spark - spark-sql_2.11 - - - org.apache.spark - spark-hive_2.11 - test - + + org.apache.spark + spark-core_2.11 + + + org.apache.spark + spark-sql_2.11 + + + org.apache.spark + spark-hive_2.11 + test + - - eu.dnetlib.dhp - dhp-common - ${project.version} - - - eu.dnetlib.dhp - dhp-schemas - ${project.version} - + + eu.dnetlib.dhp + dhp-common + ${project.version} + + + eu.dnetlib.dhp + dhp-schemas + ${project.version} + + + eu.dnetlib + dnet-pace-core + - - com.jayway.jsonpath - json-path - - - dom4j - dom4j - - - jaxen - jaxen - + + com.jayway.jsonpath + json-path + + + dom4j + dom4j + + + jaxen + jaxen + - - eu.dnetlib - dnet-openaire-broker-common - [2.0.1,3.0.0) - + + eu.dnetlib + dnet-openaire-broker-common + [2.0.1,3.0.0) + - + \ No newline at end of file diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java index 05fab47f0..44bc5cb6e 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java @@ -63,6 +63,9 @@ import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Software; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import eu.dnetlib.pace.config.DedupConfig; import scala.Tuple2; public class GenerateEventsApplication { @@ -107,7 +110,10 @@ public class GenerateEventsApplication { private static final UpdateMatcher>, ?> enrichMissingDatasetIsSupplementedBy = new EnrichMissingDatasetIsSupplementedBy(); - public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + // Aggregators + private static final TypedColumn, ResultGroup> resultAggrTypedColumn = new ResultAggregator().toColumn(); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( @@ -128,8 +134,16 @@ public class GenerateEventsApplication { final String eventsPath = parser.get("eventsPath"); log.info("eventsPath: {}", eventsPath); + final String isLookupUrl = parser.get("isLookupUrl"); + log.info("isLookupUrl: {}", isLookupUrl); + + final String dedupConfigProfileId = parser.get("dedupConfProfile"); + log.info("dedupConfigProfileId: {}", dedupConfigProfileId); + final SparkConf conf = new SparkConf(); + final DedupConfig dedupConfig = loadDedupConfig(isLookupUrl, dedupConfigProfileId); + runWithSparkSession(conf, isSparkSessionManaged, spark -> { removeOutputDir(spark, eventsPath); @@ -137,10 +151,10 @@ public class GenerateEventsApplication { final Dataset all = spark.emptyDataset(Encoders.kryo(Event.class)); for (final Class r1 : BrokerConstants.RESULT_CLASSES) { - all.union(generateSimpleEvents(spark, graphPath, r1)); + all.union(generateSimpleEvents(spark, graphPath, r1, dedupConfig)); for (final Class r2 : BrokerConstants.RESULT_CLASSES) { - all.union(generateRelationEvents(spark, graphPath, r1, r2)); + all.union(generateRelationEvents(spark, graphPath, r1, r2, dedupConfig)); } } @@ -155,38 +169,37 @@ public class GenerateEventsApplication { private static Dataset generateSimpleEvents(final SparkSession spark, final String graphPath, - final Class resultClazz) { + final Class resultClazz, + final DedupConfig dedupConfig) { final Dataset results = readPath(spark, graphPath + "/" + resultClazz.getSimpleName().toLowerCase(), Result.class) .filter(r -> r.getDataInfo().getDeletedbyinference()); - final Dataset rels = readPath(spark, graphPath + "/relation", Relation.class) + final Dataset mergedRels = readPath(spark, graphPath + "/relation", Relation.class) .filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)); - final TypedColumn, ResultGroup> aggr = new ResultAggregator().toColumn(); - - return results.joinWith(rels, results.col("id").equalTo(rels.col("source")), "inner") + return results.joinWith(mergedRels, results.col("id").equalTo(mergedRels.col("source")), "inner") .groupByKey((MapFunction, String>) t -> t._2.getTarget(), Encoders.STRING()) - .agg(aggr) + .agg(resultAggrTypedColumn) .map((MapFunction, ResultGroup>) t -> t._2, Encoders.kryo(ResultGroup.class)) .filter(ResultGroup::isValid) - .map((MapFunction) g -> GenerateEventsApplication.generateSimpleEvents(g), Encoders.kryo(EventGroup.class)) + .map((MapFunction) g -> GenerateEventsApplication.generateSimpleEvents(g, dedupConfig), Encoders.kryo(EventGroup.class)) .flatMap(group -> group.getData().iterator(), Encoders.kryo(Event.class)); } - private static EventGroup generateSimpleEvents(final ResultGroup results) { + private static EventGroup generateSimpleEvents(final ResultGroup results, final DedupConfig dedupConfig) { final List> list = new ArrayList<>(); for (final Result target : results.getData()) { - list.addAll(enrichMissingAbstract.searchUpdatesForRecord(target, results.getData())); - list.addAll(enrichMissingAuthorOrcid.searchUpdatesForRecord(target, results.getData())); - list.addAll(enrichMissingOpenAccess.searchUpdatesForRecord(target, results.getData())); - list.addAll(enrichMissingPid.searchUpdatesForRecord(target, results.getData())); - list.addAll(enrichMissingPublicationDate.searchUpdatesForRecord(target, results.getData())); - list.addAll(enrichMissingSubject.searchUpdatesForRecord(target, results.getData())); - list.addAll(enrichMoreOpenAccess.searchUpdatesForRecord(target, results.getData())); - list.addAll(enrichMorePid.searchUpdatesForRecord(target, results.getData())); - list.addAll(enrichMoreSubject.searchUpdatesForRecord(target, results.getData())); + list.addAll(enrichMissingAbstract.searchUpdatesForRecord(target, results.getData(), dedupConfig)); + list.addAll(enrichMissingAuthorOrcid.searchUpdatesForRecord(target, results.getData(), dedupConfig)); + list.addAll(enrichMissingOpenAccess.searchUpdatesForRecord(target, results.getData(), dedupConfig)); + list.addAll(enrichMissingPid.searchUpdatesForRecord(target, results.getData(), dedupConfig)); + list.addAll(enrichMissingPublicationDate.searchUpdatesForRecord(target, results.getData(), dedupConfig)); + list.addAll(enrichMissingSubject.searchUpdatesForRecord(target, results.getData(), dedupConfig)); + list.addAll(enrichMoreOpenAccess.searchUpdatesForRecord(target, results.getData(), dedupConfig)); + list.addAll(enrichMorePid.searchUpdatesForRecord(target, results.getData(), dedupConfig)); + list.addAll(enrichMoreSubject.searchUpdatesForRecord(target, results.getData(), dedupConfig)); } final EventGroup events = new EventGroup(); @@ -197,9 +210,10 @@ public class GenerateEventsApplication { private static Dataset generateRelationEvents(final SparkSession spark, final String graphPath, final Class sourceClass, - final Class targetClass) { + final Class targetClass, + final DedupConfig dedupConfig) { - final Dataset sources = readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass) + final Dataset sources = readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), Result.class) .filter(r -> r.getDataInfo().getDeletedbyinference()); final Dataset targets = readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), targetClass); @@ -210,6 +224,12 @@ public class GenerateEventsApplication { final Dataset rels = readPath(spark, graphPath + "/relation", Relation.class) .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)); + final Dataset duplicates = sources.joinWith(mergedRels, sources.col("id").equalTo(rels.col("source")), "inner") + .groupByKey((MapFunction, String>) t -> t._2.getTarget(), Encoders.STRING()) + .agg(resultAggrTypedColumn) + .map((MapFunction, ResultGroup>) t -> t._2, Encoders.kryo(ResultGroup.class)) + .filter(ResultGroup::isValid); + if (targetClass == Project.class) { // TODO join using: generateProjectsEvents } else if (targetClass == Software.class) { @@ -223,29 +243,30 @@ public class GenerateEventsApplication { return null; } - private List generateProjectsEvents(final Collection>> childrenWithProjects) { + private List generateProjectsEvents(final Collection>> childrenWithProjects, final DedupConfig dedupConfig) { final List> list = new ArrayList<>(); for (final Pair> target : childrenWithProjects) { - list.addAll(enrichMissingProject.searchUpdatesForRecord(target, childrenWithProjects)); - list.addAll(enrichMoreProject.searchUpdatesForRecord(target, childrenWithProjects)); + list.addAll(enrichMissingProject.searchUpdatesForRecord(target, childrenWithProjects, dedupConfig)); + list.addAll(enrichMoreProject.searchUpdatesForRecord(target, childrenWithProjects, dedupConfig)); } return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList()); } - private List generateSoftwareEvents(final Collection>> childrenWithSoftwares) { + private List generateSoftwareEvents(final Collection>> childrenWithSoftwares, final DedupConfig dedupConfig) { final List> list = new ArrayList<>(); for (final Pair> target : childrenWithSoftwares) { - list.addAll(enrichMissingSoftware.searchUpdatesForRecord(target, childrenWithSoftwares)); - list.addAll(enrichMoreSoftware.searchUpdatesForRecord(target, childrenWithSoftwares)); + list.addAll(enrichMissingSoftware.searchUpdatesForRecord(target, childrenWithSoftwares, dedupConfig)); + list.addAll(enrichMoreSoftware.searchUpdatesForRecord(target, childrenWithSoftwares, dedupConfig)); } return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList()); } private List generatePublicationRelatedEvents(final String relType, - final Collection>>> childrenWithRels) { + final Collection>>> childrenWithRels, + final DedupConfig dedupConfig) { final List> list = new ArrayList<>(); @@ -258,15 +279,15 @@ public class GenerateEventsApplication { for (final Pair> target : cleanedChildrens) { if (relType.equals("isRelatedTo")) { - list.addAll(enrichMisissingPublicationIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens)); + list.addAll(enrichMisissingPublicationIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } else if (relType.equals("references")) { - list.addAll(enrichMissingPublicationReferences.searchUpdatesForRecord(target, cleanedChildrens)); + list.addAll(enrichMissingPublicationReferences.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } else if (relType.equals("isReferencedBy")) { - list.addAll(enrichMissingPublicationIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens)); + list.addAll(enrichMissingPublicationIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } else if (relType.equals("isSupplementedTo")) { - list.addAll(enrichMissingPublicationIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens)); + list.addAll(enrichMissingPublicationIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } else if (relType.equals("isSupplementedBy")) { - list.addAll(enrichMissingPublicationIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens)); + list.addAll(enrichMissingPublicationIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } } @@ -275,7 +296,8 @@ public class GenerateEventsApplication { } private List generateDatasetRelatedEvents(final String relType, - final Collection>>> childrenWithRels) { + final Collection>>> childrenWithRels, + final DedupConfig dedupConfig) { final List> list = new ArrayList<>(); @@ -288,15 +310,15 @@ public class GenerateEventsApplication { for (final Pair> target : cleanedChildrens) { if (relType.equals("isRelatedTo")) { - list.addAll(enrichMisissingDatasetIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens)); + list.addAll(enrichMisissingDatasetIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } else if (relType.equals("references")) { - list.addAll(enrichMissingDatasetReferences.searchUpdatesForRecord(target, cleanedChildrens)); + list.addAll(enrichMissingDatasetReferences.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } else if (relType.equals("isReferencedBy")) { - list.addAll(enrichMissingDatasetIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens)); + list.addAll(enrichMissingDatasetIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } else if (relType.equals("isSupplementedTo")) { - list.addAll(enrichMissingDatasetIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens)); + list.addAll(enrichMissingDatasetIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } else if (relType.equals("isSupplementedBy")) { - list.addAll(enrichMissingDatasetIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens)); + list.addAll(enrichMissingDatasetIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } } @@ -313,4 +335,20 @@ public class GenerateEventsApplication { .textFile(inputPath) .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); } + + private static DedupConfig loadDedupConfig(final String isLookupUrl, final String profId) throws Exception { + final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl); + + final String conf = isLookUpService.getResourceProfileByQuery(String + .format("for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", profId)); + + final DedupConfig dedupConfig = new ObjectMapper().readValue(conf, DedupConfig.class); + dedupConfig.getPace().initModel(); + dedupConfig.getPace().initTranslationMap(); + // dedupConfig.getWf().setConfigurationId("???"); + + return dedupConfig; + + } + } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java index 5bfe108a5..286b40ad5 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java @@ -12,6 +12,7 @@ import org.apache.commons.lang3.StringUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Field; +import eu.dnetlib.pace.config.DedupConfig; public abstract class UpdateMatcher { @@ -21,16 +22,15 @@ public abstract class UpdateMatcher { this.multipleUpdate = multipleUpdate; } - public Collection> searchUpdatesForRecord(final K res, final Collection others) { + public Collection> searchUpdatesForRecord(final K res, final Collection others, final DedupConfig dedupConfig) { final Map> infoMap = new HashMap<>(); for (final K source : others) { if (source != res) { - for (final UpdateInfo info : findUpdates(source, res)) { + for (final UpdateInfo info : findUpdates(source, res, dedupConfig)) { final String s = DigestUtils.md5Hex(info.getHighlightValueAsString()); - if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) { - } else { + if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) {} else { infoMap.put(s, info); } } @@ -51,11 +51,7 @@ public abstract class UpdateMatcher { } } - protected abstract List> findUpdates(K source, K target); - - protected abstract UpdateInfo generateUpdateInfo(final T highlightValue, - final K source, - final K target); + protected abstract List> findUpdates(K source, K target, DedupConfig dedupConfig); protected static boolean isMissing(final List> list) { return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0).getValue()); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java index 321fd4318..3cf7b18f9 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java @@ -13,6 +13,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Dataset; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.pace.config.DedupConfig; public abstract class AbstractEnrichMissingDataset extends UpdateMatcher>, eu.dnetlib.broker.objects.Dataset> { @@ -27,7 +28,8 @@ public abstract class AbstractEnrichMissingDataset @Override protected final List> findUpdates( final Pair> source, - final Pair> target) { + final Pair> target, + final DedupConfig dedupConfig) { final Set existingDatasets = target .getRight() @@ -40,21 +42,22 @@ public abstract class AbstractEnrichMissingDataset .stream() .filter(d -> !existingDatasets.contains(d.getId())) .map(ConversionUtils::oafDatasetToBrokerDataset) - .map(i -> generateUpdateInfo(i, source, target)) + .map(i -> generateUpdateInfo(i, source, target, dedupConfig)) .collect(Collectors.toList()); } - @Override protected final UpdateInfo generateUpdateInfo( final eu.dnetlib.broker.objects.Dataset highlightValue, final Pair> source, - final Pair> target) { + final Pair> target, + final DedupConfig dedupConfig) { return new UpdateInfo<>( getTopic(), highlightValue, source.getLeft(), target.getLeft(), (p, rel) -> p.getDatasets().add(rel), - rel -> rel.getInstances().get(0).getUrl()); + rel -> rel.getInstances().get(0).getUrl(), + dedupConfig); } public Topic getTopic() { diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java index ed5e71d52..22817a25d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java @@ -13,6 +13,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.pace.config.DedupConfig; public class EnrichMissingProject extends UpdateMatcher>, eu.dnetlib.broker.objects.Project> { @@ -23,7 +24,8 @@ public class EnrichMissingProject @Override protected List> findUpdates(final Pair> source, - final Pair> target) { + final Pair> target, + final DedupConfig dedupConfig) { if (source.getRight().isEmpty()) { return Arrays.asList(); @@ -32,21 +34,21 @@ public class EnrichMissingProject .getRight() .stream() .map(ConversionUtils::oafProjectToBrokerProject) - .map(p -> generateUpdateInfo(p, source, target)) + .map(p -> generateUpdateInfo(p, source, target, dedupConfig)) .collect(Collectors.toList()); } } - @Override public UpdateInfo generateUpdateInfo( final eu.dnetlib.broker.objects.Project highlightValue, final Pair> source, - final Pair> target) { + final Pair> target, + final DedupConfig dedupConfig) { return new UpdateInfo<>( Topic.ENRICH_MISSING_PROJECT, highlightValue, source.getLeft(), target.getLeft(), (p, prj) -> p.getProjects().add(prj), - prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode()); + prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode(), dedupConfig); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java index 753166a82..016bdd283 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java @@ -13,6 +13,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.pace.config.DedupConfig; public class EnrichMoreProject extends UpdateMatcher>, eu.dnetlib.broker.objects.Project> { @@ -22,7 +23,8 @@ public class EnrichMoreProject extends UpdateMatcher> @Override protected List> findUpdates(final Pair> source, - final Pair> target) { + final Pair> target, + final DedupConfig dedupConfig) { final Set existingProjects = source .getRight() @@ -35,20 +37,20 @@ public class EnrichMoreProject extends UpdateMatcher> .stream() .filter(p -> !existingProjects.contains(p.getId())) .map(ConversionUtils::oafProjectToBrokerProject) - .map(p -> generateUpdateInfo(p, source, target)) + .map(p -> generateUpdateInfo(p, source, target, dedupConfig)) .collect(Collectors.toList()); } - @Override public UpdateInfo generateUpdateInfo( final eu.dnetlib.broker.objects.Project highlightValue, final Pair> source, - final Pair> target) { + final Pair> target, + final DedupConfig dedupConfig) { return new UpdateInfo<>( Topic.ENRICH_MORE_PROJECT, highlightValue, source.getLeft(), target.getLeft(), (p, prj) -> p.getProjects().add(prj), - prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode()); + prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode(), dedupConfig); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java index 074b6043a..ec575e68d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java @@ -13,6 +13,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.pace.config.DedupConfig; public abstract class AbstractEnrichMissingPublication extends UpdateMatcher>, eu.dnetlib.broker.objects.Publication> { @@ -27,7 +28,8 @@ public abstract class AbstractEnrichMissingPublication @Override protected final List> findUpdates( final Pair> source, - final Pair> target) { + final Pair> target, + final DedupConfig dedupConfig) { final Set existingPublications = target .getRight() @@ -40,21 +42,21 @@ public abstract class AbstractEnrichMissingPublication .stream() .filter(d -> !existingPublications.contains(d.getId())) .map(ConversionUtils::oafResultToBrokerPublication) - .map(i -> generateUpdateInfo(i, source, target)) + .map(i -> generateUpdateInfo(i, source, target, dedupConfig)) .collect(Collectors.toList()); } - @Override protected final UpdateInfo generateUpdateInfo( final eu.dnetlib.broker.objects.Publication highlightValue, final Pair> source, - final Pair> target) { + final Pair> target, + final DedupConfig dedupConfig) { return new UpdateInfo<>( getTopic(), highlightValue, source.getLeft(), target.getLeft(), (p, rel) -> p.getPublications().add(rel), - rel -> rel.getInstances().get(0).getUrl()); + rel -> rel.getInstances().get(0).getUrl(), dedupConfig); } public Topic getTopic() { diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java index fd9091dc1..699d546ec 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java @@ -13,6 +13,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Software; +import eu.dnetlib.pace.config.DedupConfig; public class EnrichMissingSoftware extends UpdateMatcher>, eu.dnetlib.broker.objects.Software> { @@ -24,7 +25,8 @@ public class EnrichMissingSoftware @Override protected List> findUpdates( final Pair> source, - final Pair> target) { + final Pair> target, + final DedupConfig dedupConfig) { if (source.getRight().isEmpty()) { return Arrays.asList(); @@ -33,21 +35,21 @@ public class EnrichMissingSoftware .getRight() .stream() .map(ConversionUtils::oafSoftwareToBrokerSoftware) - .map(p -> generateUpdateInfo(p, source, target)) + .map(p -> generateUpdateInfo(p, source, target, dedupConfig)) .collect(Collectors.toList()); } } - @Override public UpdateInfo generateUpdateInfo( final eu.dnetlib.broker.objects.Software highlightValue, final Pair> source, - final Pair> target) { + final Pair> target, + final DedupConfig dedupConfig) { return new UpdateInfo<>( Topic.ENRICH_MISSING_SOFTWARE, highlightValue, source.getLeft(), target.getLeft(), (p, s) -> p.getSoftwares().add(s), - s -> s.getName()); + s -> s.getName(), dedupConfig); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java index cb649dfff..45631df20 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java @@ -13,6 +13,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Software; +import eu.dnetlib.pace.config.DedupConfig; public class EnrichMoreSoftware extends UpdateMatcher>, eu.dnetlib.broker.objects.Software> { @@ -24,7 +25,8 @@ public class EnrichMoreSoftware @Override protected List> findUpdates( final Pair> source, - final Pair> target) { + final Pair> target, + final DedupConfig dedupConfig) { final Set existingSoftwares = source .getRight() @@ -37,20 +39,20 @@ public class EnrichMoreSoftware .stream() .filter(p -> !existingSoftwares.contains(p.getId())) .map(ConversionUtils::oafSoftwareToBrokerSoftware) - .map(p -> generateUpdateInfo(p, source, target)) + .map(p -> generateUpdateInfo(p, source, target, dedupConfig)) .collect(Collectors.toList()); } - @Override public UpdateInfo generateUpdateInfo( final eu.dnetlib.broker.objects.Software highlightValue, final Pair> source, - final Pair> target) { + final Pair> target, + final DedupConfig dedupConfig) { return new UpdateInfo<>( Topic.ENRICH_MORE_SOFTWARE, highlightValue, source.getLeft(), target.getLeft(), (p, s) -> p.getSoftwares().add(s), - s -> s.getName()); + s -> s.getName(), dedupConfig); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java index a418b633e..c3b6bda66 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java @@ -9,6 +9,7 @@ import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.pace.config.DedupConfig; public class EnrichMissingAbstract extends UpdateMatcher { @@ -17,22 +18,22 @@ public class EnrichMissingAbstract extends UpdateMatcher { } @Override - protected List> findUpdates(final Result source, final Result target) { + protected List> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) { if (isMissing(target.getDescription()) && !isMissing(source.getDescription())) { - return Arrays.asList(generateUpdateInfo(source.getDescription().get(0).getValue(), source, target)); + return Arrays.asList(generateUpdateInfo(source.getDescription().get(0).getValue(), source, target, dedupConfig)); } return new ArrayList<>(); } - @Override public UpdateInfo generateUpdateInfo(final String highlightValue, final Result source, - final Result target) { + final Result target, + final DedupConfig dedupConfig) { return new UpdateInfo<>( Topic.ENRICH_MISSING_ABSTRACT, highlightValue, source, target, (p, s) -> p.getAbstracts().add(s), - s -> s); + s -> s, dedupConfig); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java index b5c2f7e72..89292d3da 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java @@ -10,6 +10,7 @@ import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.pace.config.DedupConfig; public class EnrichMissingAuthorOrcid extends UpdateMatcher> { @@ -18,19 +19,21 @@ public class EnrichMissingAuthorOrcid extends UpdateMatcher>> findUpdates(final Result source, final Result target) { + protected List>> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) { + // TODO // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); return Arrays.asList(); } - @Override public UpdateInfo> generateUpdateInfo(final Pair highlightValue, final Result source, - final Result target) { + final Result target, + final DedupConfig dedupConfig) { return new UpdateInfo<>( Topic.ENRICH_MISSING_AUTHOR_ORCID, highlightValue, source, target, (p, pair) -> p.getCreators().add(pair.getLeft() + " - ORCID: " + pair.getRight()), - pair -> pair.getLeft() + "::" + pair.getRight()); + pair -> pair.getLeft() + "::" + pair.getRight(), + dedupConfig); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java index 30638cefb..7f5a595cc 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java @@ -12,6 +12,7 @@ import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.pace.config.DedupConfig; public class EnrichMissingOpenAccess extends UpdateMatcher { @@ -20,7 +21,7 @@ public class EnrichMissingOpenAccess extends UpdateMatcher { } @Override - protected List> findUpdates(final Result source, final Result target) { + protected List> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) { final long count = target .getInstance() .stream() @@ -28,9 +29,7 @@ public class EnrichMissingOpenAccess extends UpdateMatcher { .filter(right -> right.equals(BrokerConstants.OPEN_ACCESS)) .count(); - if (count > 0) { - return Arrays.asList(); - } + if (count > 0) { return Arrays.asList(); } return source .getInstance() @@ -38,19 +37,19 @@ public class EnrichMissingOpenAccess extends UpdateMatcher { .filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS)) .map(ConversionUtils::oafInstanceToBrokerInstances) .flatMap(List::stream) - .map(i -> generateUpdateInfo(i, source, target)) + .map(i -> generateUpdateInfo(i, source, target, dedupConfig)) .collect(Collectors.toList()); } - @Override public UpdateInfo generateUpdateInfo(final Instance highlightValue, final Result source, - final Result target) { + final Result target, + final DedupConfig dedupConfig) { return new UpdateInfo<>( Topic.ENRICH_MISSING_OA_VERSION, highlightValue, source, target, (p, i) -> p.getInstances().add(i), - Instance::getUrl); + Instance::getUrl, dedupConfig); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java index 522d46d40..6e106e669 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java @@ -11,6 +11,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.pace.config.DedupConfig; public class EnrichMissingPid extends UpdateMatcher { @@ -19,28 +20,25 @@ public class EnrichMissingPid extends UpdateMatcher { } @Override - protected List> findUpdates(final Result source, final Result target) { + protected List> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) { final long count = target.getPid().size(); - if (count > 0) { - return Arrays.asList(); - } + if (count > 0) { return Arrays.asList(); } return source .getPid() .stream() .map(ConversionUtils::oafPidToBrokerPid) - .map(i -> generateUpdateInfo(i, source, target)) + .map(i -> generateUpdateInfo(i, source, target, dedupConfig)) .collect(Collectors.toList()); } - @Override - public UpdateInfo generateUpdateInfo(final Pid highlightValue, final Result source, final Result target) { + public UpdateInfo generateUpdateInfo(final Pid highlightValue, final Result source, final Result target, final DedupConfig dedupConfig) { return new UpdateInfo<>( Topic.ENRICH_MISSING_PID, highlightValue, source, target, (p, pid) -> p.getPids().add(pid), - pid -> pid.getType() + "::" + pid.getValue()); + pid -> pid.getType() + "::" + pid.getValue(), dedupConfig); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java index 197ace97c..d2b28d65d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java @@ -9,6 +9,7 @@ import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.pace.config.DedupConfig; public class EnrichMissingPublicationDate extends UpdateMatcher { @@ -17,22 +18,22 @@ public class EnrichMissingPublicationDate extends UpdateMatcher } @Override - protected List> findUpdates(final Result source, final Result target) { + protected List> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) { if (isMissing(target.getDateofacceptance()) && !isMissing(source.getDateofacceptance())) { - return Arrays.asList(generateUpdateInfo(source.getDateofacceptance().getValue(), source, target)); + return Arrays.asList(generateUpdateInfo(source.getDateofacceptance().getValue(), source, target, dedupConfig)); } return new ArrayList<>(); } - @Override public UpdateInfo generateUpdateInfo(final String highlightValue, final Result source, - final Result target) { + final Result target, + final DedupConfig dedupConfig) { return new UpdateInfo<>( Topic.ENRICH_MISSING_PUBLICATION_DATE, highlightValue, source, target, (p, date) -> p.setPublicationdate(date), - s -> s); + s -> s, dedupConfig); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java index 290bad48b..de888ff87 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java @@ -14,6 +14,7 @@ import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.pace.config.DedupConfig; public class EnrichMissingSubject extends UpdateMatcher> { @@ -22,7 +23,7 @@ public class EnrichMissingSubject extends UpdateMatcher>> findUpdates(final Result source, final Result target) { + protected List>> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) { final Set existingTypes = target .getSubject() .stream() @@ -35,20 +36,20 @@ public class EnrichMissingSubject extends UpdateMatcher !existingTypes.contains(pid.getQualifier().getClassid())) .map(ConversionUtils::oafSubjectToPair) - .map(i -> generateUpdateInfo(i, source, target)) + .map(i -> generateUpdateInfo(i, source, target, dedupConfig)) .collect(Collectors.toList()); } - @Override public UpdateInfo> generateUpdateInfo(final Pair highlightValue, final Result source, - final Result target) { + final Result target, + final DedupConfig dedupConfig) { return new UpdateInfo<>( Topic.fromPath("ENRICH/MISSING/SUBJECT/" + highlightValue.getLeft()), highlightValue, source, target, (p, pair) -> p.getSubjects().add(pair.getRight()), - pair -> pair.getLeft() + "::" + pair.getRight()); + pair -> pair.getLeft() + "::" + pair.getRight(), dedupConfig); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java index f9c5f81da..021449797 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java @@ -12,6 +12,7 @@ import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.pace.config.DedupConfig; public class EnrichMoreOpenAccess extends UpdateMatcher { @@ -20,7 +21,7 @@ public class EnrichMoreOpenAccess extends UpdateMatcher { } @Override - protected List> findUpdates(final Result source, final Result target) { + protected List> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) { final Set urls = target .getInstance() .stream() @@ -36,19 +37,19 @@ public class EnrichMoreOpenAccess extends UpdateMatcher { .map(ConversionUtils::oafInstanceToBrokerInstances) .flatMap(List::stream) .filter(i -> !urls.contains(i.getUrl())) - .map(i -> generateUpdateInfo(i, source, target)) + .map(i -> generateUpdateInfo(i, source, target, dedupConfig)) .collect(Collectors.toList()); } - @Override public UpdateInfo generateUpdateInfo(final Instance highlightValue, final Result source, - final Result target) { + final Result target, + final DedupConfig dedupConfig) { return new UpdateInfo<>( Topic.ENRICH_MORE_OA_VERSION, highlightValue, source, target, (p, i) -> p.getInstances().add(i), - Instance::getUrl); + Instance::getUrl, dedupConfig); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java index 2ee327c83..c64ed20ea 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java @@ -11,6 +11,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.pace.config.DedupConfig; public class EnrichMorePid extends UpdateMatcher { @@ -19,7 +20,7 @@ public class EnrichMorePid extends UpdateMatcher { } @Override - protected List> findUpdates(final Result source, final Result target) { + protected List> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) { final Set existingPids = target .getPid() .stream() @@ -31,17 +32,16 @@ public class EnrichMorePid extends UpdateMatcher { .stream() .filter(pid -> !existingPids.contains(pid.getQualifier().getClassid() + "::" + pid.getValue())) .map(ConversionUtils::oafPidToBrokerPid) - .map(i -> generateUpdateInfo(i, source, target)) + .map(i -> generateUpdateInfo(i, source, target, dedupConfig)) .collect(Collectors.toList()); } - @Override - public UpdateInfo generateUpdateInfo(final Pid highlightValue, final Result source, final Result target) { + public UpdateInfo generateUpdateInfo(final Pid highlightValue, final Result source, final Result target, final DedupConfig dedupConfig) { return new UpdateInfo<>( Topic.ENRICH_MORE_PID, highlightValue, source, target, (p, pid) -> p.getPids().add(pid), - pid -> pid.getType() + "::" + pid.getValue()); + pid -> pid.getType() + "::" + pid.getValue(), dedupConfig); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java index b38445e88..3f7f5b3d5 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java @@ -12,6 +12,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.pace.config.DedupConfig; public class EnrichMoreSubject extends UpdateMatcher> { @@ -20,7 +21,7 @@ public class EnrichMoreSubject extends UpdateMatcher>> findUpdates(final Result source, final Result target) { + protected List>> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) { final Set existingSubjects = target .getSubject() .stream() @@ -32,20 +33,20 @@ public class EnrichMoreSubject extends UpdateMatcher !existingSubjects.contains(pid.getQualifier().getClassid() + "::" + pid.getValue())) .map(ConversionUtils::oafSubjectToPair) - .map(i -> generateUpdateInfo(i, source, target)) + .map(i -> generateUpdateInfo(i, source, target, dedupConfig)) .collect(Collectors.toList()); } - @Override public UpdateInfo> generateUpdateInfo(final Pair highlightValue, final Result source, - final Result target) { + final Result target, + final DedupConfig dedupConfig) { return new UpdateInfo<>( Topic.fromPath("ENRICH/MORE/SUBJECT/" + highlightValue.getLeft()), highlightValue, source, target, (p, pair) -> p.getSubjects().add(pair.getRight()), - pair -> pair.getLeft() + "::" + pair.getRight()); + pair -> pair.getLeft() + "::" + pair.getRight(), dedupConfig); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java index f4da59518..0665c69dd 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java @@ -15,6 +15,9 @@ public class BrokerConstants { public static final String OPEN_ACCESS = "OPEN"; public static final String IS_MERGED_IN_CLASS = "isMergedIn"; + public static final float MIN_TRUST = 0.25f; + public static final float MAX_TRUST = 1.00f; + public static final List> RESULT_CLASSES = Arrays .asList(Publication.class, Dataset.class, Software.class, OtherResearchProduct.class); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java new file mode 100644 index 000000000..6bf59c125 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java @@ -0,0 +1,15 @@ +package eu.dnetlib.dhp.broker.oa.util; + +public class TrustUtils { + + public static float rescale(final double score, final double threshold) { + if (score >= BrokerConstants.MAX_TRUST) { return BrokerConstants.MAX_TRUST; } + + final double val = (score - threshold) * (BrokerConstants.MAX_TRUST - BrokerConstants.MIN_TRUST) / (BrokerConstants.MAX_TRUST - threshold); + + if (val < BrokerConstants.MIN_TRUST) { return BrokerConstants.MIN_TRUST; } + if (val > BrokerConstants.MAX_TRUST) { return BrokerConstants.MAX_TRUST; } + + return (float) val; + } +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java index 04b426a1c..de6a71397 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java @@ -5,6 +5,11 @@ import java.util.List; import java.util.function.BiConsumer; import java.util.function.Function; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + import eu.dnetlib.broker.objects.OpenAireEventPayload; import eu.dnetlib.broker.objects.Provenance; import eu.dnetlib.broker.objects.Publication; @@ -12,6 +17,10 @@ import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.schema.oaf.Instance; import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.tree.support.TreeProcessor; +import eu.dnetlib.pace.util.MapDocumentUtil; public final class UpdateInfo { @@ -29,16 +38,19 @@ public final class UpdateInfo { private final float trust; + private static final Logger log = LoggerFactory.getLogger(UpdateInfo.class); + public UpdateInfo(final Topic topic, final T highlightValue, final Result source, final Result target, final BiConsumer compileHighlight, - final Function highlightToString) { + final Function highlightToString, + final DedupConfig dedupConfig) { this.topic = topic; this.highlightValue = highlightValue; this.source = source; this.target = target; this.compileHighlight = compileHighlight; this.highlightToString = highlightToString; - this.trust = calculateTrust(source, target); + this.trust = calculateTrust(dedupConfig, source, target); } public T getHighlightValue() { @@ -53,9 +65,20 @@ public final class UpdateInfo { return target; } - private float calculateTrust(final Result source, final Result target) { - // TODO - return 0.9f; + private float calculateTrust(final DedupConfig dedupConfig, final Result r1, final Result r2) { + try { + final ObjectMapper objectMapper = new ObjectMapper(); + final MapDocument doc1 = MapDocumentUtil.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r1)); + final MapDocument doc2 = MapDocumentUtil.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r2)); + + final double score = new TreeProcessor(dedupConfig).computeScore(doc1, doc2); + final double threshold = dedupConfig.getWf().getThreshold(); + + return TrustUtils.rescale(score, threshold); + } catch (final Exception e) { + log.error("Error computing score between results", e); + return BrokerConstants.MIN_TRUST; + } } protected Topic getTopic() { @@ -95,8 +118,7 @@ public final class UpdateInfo { .map(Instance::getUrl) .flatMap(List::stream) .findFirst() - .orElse(null); - ; + .orElse(null);; final Provenance provenance = new Provenance().setId(provId).setRepositoryName(provRepo).setUrl(provUrl); diff --git a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java new file mode 100644 index 000000000..58f391c24 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java @@ -0,0 +1,72 @@ +package eu.dnetlib.dhp.broker.oa.util; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.junit.jupiter.api.Test; + +public class TrustUtilsTest { + + private static final double THRESHOLD = 0.95; + + @Test + public void rescaleTest_1() { + verifyValue(-0.3, BrokerConstants.MIN_TRUST); + } + + @Test + public void rescaleTest_2() { + verifyValue(0.0, BrokerConstants.MIN_TRUST); + } + + @Test + public void rescaleTest_3() { + verifyValue(0.5, BrokerConstants.MIN_TRUST); + } + + @Test + public void rescaleTest_4() { + verifyValue(0.95, BrokerConstants.MIN_TRUST); + } + + @Test + public void rescaleTest_5() { + verifyValue(0.96, BrokerConstants.MIN_TRUST); + } + + @Test + public void rescaleTest_6() { + verifyValue(0.97, 0.3f); + } + + @Test + public void rescaleTest_7() { + verifyValue(0.98, 0.45f); + } + + @Test + public void rescaleTest_8() { + verifyValue(0.99, 0.6f); + } + + @Test + public void rescaleTest_9() { + verifyValue(1.00, BrokerConstants.MAX_TRUST); + } + + @Test + public void rescaleTest_10() { + verifyValue(1.01, BrokerConstants.MAX_TRUST); + } + + @Test + public void rescaleTest_11() { + verifyValue(2.00, BrokerConstants.MAX_TRUST); + } + + private void verifyValue(final double originalScore, final float expectedTrust) { + final float trust = TrustUtils.rescale(originalScore, THRESHOLD); + System.out.println(trust); + assertTrue(Math.abs(trust - expectedTrust) < 0.01); + } + +}