enrichment steps #38

Merged
claudio.atzori merged 334 commits from miriam.baglioni/dnet-hadoop:master into enrichment_wfs 2020-08-11 16:40:26 +02:00
22 changed files with 360 additions and 190 deletions
Showing only changes of commit baaa55f4a3 - Show all commits

View File

@ -1,5 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent> <parent>
<artifactId>dhp-workflows</artifactId> <artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
@ -40,6 +42,10 @@
<artifactId>dhp-schemas</artifactId> <artifactId>dhp-schemas</artifactId>
<version>${project.version}</version> <version>${project.version}</version>
</dependency> </dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-pace-core</artifactId>
</dependency>
<dependency> <dependency>
<groupId>com.jayway.jsonpath</groupId> <groupId>com.jayway.jsonpath</groupId>

View File

@ -63,6 +63,9 @@ import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.config.DedupConfig;
import scala.Tuple2; import scala.Tuple2;
public class GenerateEventsApplication { public class GenerateEventsApplication {
@ -107,7 +110,10 @@ public class GenerateEventsApplication {
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsSupplementedBy = private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsSupplementedBy =
new EnrichMissingDatasetIsSupplementedBy(); new EnrichMissingDatasetIsSupplementedBy();
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); // Aggregators
private static final TypedColumn<Tuple2<Result, Relation>, ResultGroup> resultAggrTypedColumn = new ResultAggregator().toColumn();
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(final String[] args) throws Exception { public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser( final ArgumentApplicationParser parser = new ArgumentApplicationParser(
@ -128,8 +134,16 @@ public class GenerateEventsApplication {
final String eventsPath = parser.get("eventsPath"); final String eventsPath = parser.get("eventsPath");
log.info("eventsPath: {}", eventsPath); log.info("eventsPath: {}", eventsPath);
final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl);
final String dedupConfigProfileId = parser.get("dedupConfProfile");
log.info("dedupConfigProfileId: {}", dedupConfigProfileId);
final SparkConf conf = new SparkConf(); final SparkConf conf = new SparkConf();
final DedupConfig dedupConfig = loadDedupConfig(isLookupUrl, dedupConfigProfileId);
runWithSparkSession(conf, isSparkSessionManaged, spark -> { runWithSparkSession(conf, isSparkSessionManaged, spark -> {
removeOutputDir(spark, eventsPath); removeOutputDir(spark, eventsPath);
@ -137,10 +151,10 @@ public class GenerateEventsApplication {
final Dataset<Event> all = spark.emptyDataset(Encoders.kryo(Event.class)); final Dataset<Event> all = spark.emptyDataset(Encoders.kryo(Event.class));
for (final Class<? extends Result> r1 : BrokerConstants.RESULT_CLASSES) { for (final Class<? extends Result> r1 : BrokerConstants.RESULT_CLASSES) {
all.union(generateSimpleEvents(spark, graphPath, r1)); all.union(generateSimpleEvents(spark, graphPath, r1, dedupConfig));
for (final Class<? extends Result> r2 : BrokerConstants.RESULT_CLASSES) { for (final Class<? extends Result> r2 : BrokerConstants.RESULT_CLASSES) {
all.union(generateRelationEvents(spark, graphPath, r1, r2)); all.union(generateRelationEvents(spark, graphPath, r1, r2, dedupConfig));
} }
} }
@ -155,38 +169,37 @@ public class GenerateEventsApplication {
private static <R extends Result> Dataset<Event> generateSimpleEvents(final SparkSession spark, private static <R extends Result> Dataset<Event> generateSimpleEvents(final SparkSession spark,
final String graphPath, final String graphPath,
final Class<R> resultClazz) { final Class<R> resultClazz,
final DedupConfig dedupConfig) {
final Dataset<Result> results = readPath(spark, graphPath + "/" + resultClazz.getSimpleName().toLowerCase(), Result.class) final Dataset<Result> results = readPath(spark, graphPath + "/" + resultClazz.getSimpleName().toLowerCase(), Result.class)
.filter(r -> r.getDataInfo().getDeletedbyinference()); .filter(r -> r.getDataInfo().getDeletedbyinference());
final Dataset<Relation> rels = readPath(spark, graphPath + "/relation", Relation.class) final Dataset<Relation> mergedRels = readPath(spark, graphPath + "/relation", Relation.class)
.filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)); .filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
final TypedColumn<Tuple2<Result, Relation>, ResultGroup> aggr = new ResultAggregator().toColumn(); return results.joinWith(mergedRels, results.col("id").equalTo(mergedRels.col("source")), "inner")
return results.joinWith(rels, results.col("id").equalTo(rels.col("source")), "inner")
.groupByKey((MapFunction<Tuple2<Result, Relation>, String>) t -> t._2.getTarget(), Encoders.STRING()) .groupByKey((MapFunction<Tuple2<Result, Relation>, String>) t -> t._2.getTarget(), Encoders.STRING())
.agg(aggr) .agg(resultAggrTypedColumn)
.map((MapFunction<Tuple2<String, ResultGroup>, ResultGroup>) t -> t._2, Encoders.kryo(ResultGroup.class)) .map((MapFunction<Tuple2<String, ResultGroup>, ResultGroup>) t -> t._2, Encoders.kryo(ResultGroup.class))
.filter(ResultGroup::isValid) .filter(ResultGroup::isValid)
.map((MapFunction<ResultGroup, EventGroup>) g -> GenerateEventsApplication.generateSimpleEvents(g), Encoders.kryo(EventGroup.class)) .map((MapFunction<ResultGroup, EventGroup>) g -> GenerateEventsApplication.generateSimpleEvents(g, dedupConfig), Encoders.kryo(EventGroup.class))
.flatMap(group -> group.getData().iterator(), Encoders.kryo(Event.class)); .flatMap(group -> group.getData().iterator(), Encoders.kryo(Event.class));
} }
private static EventGroup generateSimpleEvents(final ResultGroup results) { private static EventGroup generateSimpleEvents(final ResultGroup results, final DedupConfig dedupConfig) {
final List<UpdateInfo<?>> list = new ArrayList<>(); final List<UpdateInfo<?>> list = new ArrayList<>();
for (final Result target : results.getData()) { for (final Result target : results.getData()) {
list.addAll(enrichMissingAbstract.searchUpdatesForRecord(target, results.getData())); list.addAll(enrichMissingAbstract.searchUpdatesForRecord(target, results.getData(), dedupConfig));
list.addAll(enrichMissingAuthorOrcid.searchUpdatesForRecord(target, results.getData())); list.addAll(enrichMissingAuthorOrcid.searchUpdatesForRecord(target, results.getData(), dedupConfig));
list.addAll(enrichMissingOpenAccess.searchUpdatesForRecord(target, results.getData())); list.addAll(enrichMissingOpenAccess.searchUpdatesForRecord(target, results.getData(), dedupConfig));
list.addAll(enrichMissingPid.searchUpdatesForRecord(target, results.getData())); list.addAll(enrichMissingPid.searchUpdatesForRecord(target, results.getData(), dedupConfig));
list.addAll(enrichMissingPublicationDate.searchUpdatesForRecord(target, results.getData())); list.addAll(enrichMissingPublicationDate.searchUpdatesForRecord(target, results.getData(), dedupConfig));
list.addAll(enrichMissingSubject.searchUpdatesForRecord(target, results.getData())); list.addAll(enrichMissingSubject.searchUpdatesForRecord(target, results.getData(), dedupConfig));
list.addAll(enrichMoreOpenAccess.searchUpdatesForRecord(target, results.getData())); list.addAll(enrichMoreOpenAccess.searchUpdatesForRecord(target, results.getData(), dedupConfig));
list.addAll(enrichMorePid.searchUpdatesForRecord(target, results.getData())); list.addAll(enrichMorePid.searchUpdatesForRecord(target, results.getData(), dedupConfig));
list.addAll(enrichMoreSubject.searchUpdatesForRecord(target, results.getData())); list.addAll(enrichMoreSubject.searchUpdatesForRecord(target, results.getData(), dedupConfig));
} }
final EventGroup events = new EventGroup(); final EventGroup events = new EventGroup();
@ -197,9 +210,10 @@ public class GenerateEventsApplication {
private static <SRC extends Result, TRG extends OafEntity> Dataset<Event> generateRelationEvents(final SparkSession spark, private static <SRC extends Result, TRG extends OafEntity> Dataset<Event> generateRelationEvents(final SparkSession spark,
final String graphPath, final String graphPath,
final Class<SRC> sourceClass, final Class<SRC> sourceClass,
final Class<TRG> targetClass) { final Class<TRG> targetClass,
final DedupConfig dedupConfig) {
final Dataset<SRC> sources = readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass) final Dataset<Result> sources = readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), Result.class)
.filter(r -> r.getDataInfo().getDeletedbyinference()); .filter(r -> r.getDataInfo().getDeletedbyinference());
final Dataset<TRG> targets = readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), targetClass); final Dataset<TRG> targets = readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), targetClass);
@ -210,6 +224,12 @@ public class GenerateEventsApplication {
final Dataset<Relation> rels = readPath(spark, graphPath + "/relation", Relation.class) final Dataset<Relation> rels = readPath(spark, graphPath + "/relation", Relation.class)
.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)); .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
final Dataset<ResultGroup> duplicates = sources.joinWith(mergedRels, sources.col("id").equalTo(rels.col("source")), "inner")
.groupByKey((MapFunction<Tuple2<Result, Relation>, String>) t -> t._2.getTarget(), Encoders.STRING())
.agg(resultAggrTypedColumn)
.map((MapFunction<Tuple2<String, ResultGroup>, ResultGroup>) t -> t._2, Encoders.kryo(ResultGroup.class))
.filter(ResultGroup::isValid);
if (targetClass == Project.class) { if (targetClass == Project.class) {
// TODO join using: generateProjectsEvents // TODO join using: generateProjectsEvents
} else if (targetClass == Software.class) { } else if (targetClass == Software.class) {
@ -223,29 +243,30 @@ public class GenerateEventsApplication {
return null; return null;
} }
private List<Event> generateProjectsEvents(final Collection<Pair<Result, List<Project>>> childrenWithProjects) { private List<Event> generateProjectsEvents(final Collection<Pair<Result, List<Project>>> childrenWithProjects, final DedupConfig dedupConfig) {
final List<UpdateInfo<?>> list = new ArrayList<>(); final List<UpdateInfo<?>> list = new ArrayList<>();
for (final Pair<Result, List<Project>> target : childrenWithProjects) { for (final Pair<Result, List<Project>> target : childrenWithProjects) {
list.addAll(enrichMissingProject.searchUpdatesForRecord(target, childrenWithProjects)); list.addAll(enrichMissingProject.searchUpdatesForRecord(target, childrenWithProjects, dedupConfig));
list.addAll(enrichMoreProject.searchUpdatesForRecord(target, childrenWithProjects)); list.addAll(enrichMoreProject.searchUpdatesForRecord(target, childrenWithProjects, dedupConfig));
} }
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList()); return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
} }
private List<Event> generateSoftwareEvents(final Collection<Pair<Result, List<Software>>> childrenWithSoftwares) { private List<Event> generateSoftwareEvents(final Collection<Pair<Result, List<Software>>> childrenWithSoftwares, final DedupConfig dedupConfig) {
final List<UpdateInfo<?>> list = new ArrayList<>(); final List<UpdateInfo<?>> list = new ArrayList<>();
for (final Pair<Result, List<Software>> target : childrenWithSoftwares) { for (final Pair<Result, List<Software>> target : childrenWithSoftwares) {
list.addAll(enrichMissingSoftware.searchUpdatesForRecord(target, childrenWithSoftwares)); list.addAll(enrichMissingSoftware.searchUpdatesForRecord(target, childrenWithSoftwares, dedupConfig));
list.addAll(enrichMoreSoftware.searchUpdatesForRecord(target, childrenWithSoftwares)); list.addAll(enrichMoreSoftware.searchUpdatesForRecord(target, childrenWithSoftwares, dedupConfig));
} }
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList()); return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
} }
private List<Event> generatePublicationRelatedEvents(final String relType, private List<Event> generatePublicationRelatedEvents(final String relType,
final Collection<Pair<Result, Map<String, List<Publication>>>> childrenWithRels) { final Collection<Pair<Result, Map<String, List<Publication>>>> childrenWithRels,
final DedupConfig dedupConfig) {
final List<UpdateInfo<?>> list = new ArrayList<>(); final List<UpdateInfo<?>> list = new ArrayList<>();
@ -258,15 +279,15 @@ public class GenerateEventsApplication {
for (final Pair<Result, List<Publication>> target : cleanedChildrens) { for (final Pair<Result, List<Publication>> target : cleanedChildrens) {
if (relType.equals("isRelatedTo")) { if (relType.equals("isRelatedTo")) {
list.addAll(enrichMisissingPublicationIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens)); list.addAll(enrichMisissingPublicationIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
} else if (relType.equals("references")) { } else if (relType.equals("references")) {
list.addAll(enrichMissingPublicationReferences.searchUpdatesForRecord(target, cleanedChildrens)); list.addAll(enrichMissingPublicationReferences.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
} else if (relType.equals("isReferencedBy")) { } else if (relType.equals("isReferencedBy")) {
list.addAll(enrichMissingPublicationIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens)); list.addAll(enrichMissingPublicationIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
} else if (relType.equals("isSupplementedTo")) { } else if (relType.equals("isSupplementedTo")) {
list.addAll(enrichMissingPublicationIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens)); list.addAll(enrichMissingPublicationIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
} else if (relType.equals("isSupplementedBy")) { } else if (relType.equals("isSupplementedBy")) {
list.addAll(enrichMissingPublicationIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens)); list.addAll(enrichMissingPublicationIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
} }
} }
@ -275,7 +296,8 @@ public class GenerateEventsApplication {
} }
private List<Event> generateDatasetRelatedEvents(final String relType, private List<Event> generateDatasetRelatedEvents(final String relType,
final Collection<Pair<Result, Map<String, List<eu.dnetlib.dhp.schema.oaf.Dataset>>>> childrenWithRels) { final Collection<Pair<Result, Map<String, List<eu.dnetlib.dhp.schema.oaf.Dataset>>>> childrenWithRels,
final DedupConfig dedupConfig) {
final List<UpdateInfo<?>> list = new ArrayList<>(); final List<UpdateInfo<?>> list = new ArrayList<>();
@ -288,15 +310,15 @@ public class GenerateEventsApplication {
for (final Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>> target : cleanedChildrens) { for (final Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>> target : cleanedChildrens) {
if (relType.equals("isRelatedTo")) { if (relType.equals("isRelatedTo")) {
list.addAll(enrichMisissingDatasetIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens)); list.addAll(enrichMisissingDatasetIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
} else if (relType.equals("references")) { } else if (relType.equals("references")) {
list.addAll(enrichMissingDatasetReferences.searchUpdatesForRecord(target, cleanedChildrens)); list.addAll(enrichMissingDatasetReferences.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
} else if (relType.equals("isReferencedBy")) { } else if (relType.equals("isReferencedBy")) {
list.addAll(enrichMissingDatasetIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens)); list.addAll(enrichMissingDatasetIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
} else if (relType.equals("isSupplementedTo")) { } else if (relType.equals("isSupplementedTo")) {
list.addAll(enrichMissingDatasetIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens)); list.addAll(enrichMissingDatasetIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
} else if (relType.equals("isSupplementedBy")) { } else if (relType.equals("isSupplementedBy")) {
list.addAll(enrichMissingDatasetIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens)); list.addAll(enrichMissingDatasetIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
} }
} }
@ -313,4 +335,20 @@ public class GenerateEventsApplication {
.textFile(inputPath) .textFile(inputPath)
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); .map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
} }
/**
 * Loads the deduplication configuration stored in an Information Service profile.
 *
 * <p>The profile whose RESOURCE_IDENTIFIER equals {@code profId} is queried through the lookup
 * service obtained for {@code isLookupUrl}; the text of its DEDUPLICATION element is parsed as
 * JSON into a {@link DedupConfig}, whose pace model and translation map are then initialised.
 *
 * @param isLookupUrl address handed to {@code ISLookupClientFactory} to obtain the lookup service
 * @param profId identifier of the profile holding the dedup configuration
 * @return the parsed and initialised configuration
 * @throws IllegalStateException if the lookup returns no configuration text for {@code profId}
 * @throws Exception if the lookup call or the JSON parsing fails
 */
private static DedupConfig loadDedupConfig(final String isLookupUrl, final String profId) throws Exception {
    final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl);

    final String query = String
        .format("for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", profId);
    final String conf = isLookUpService.getResourceProfileByQuery(query);

    // Fail fast with the profile id in the message: handing null/blank content to the JSON
    // parser would otherwise surface as an opaque parsing error.
    if (conf == null || conf.trim().isEmpty()) {
        throw new IllegalStateException("No dedup configuration found in profile: " + profId);
    }

    final DedupConfig dedupConfig = new ObjectMapper().readValue(conf, DedupConfig.class);
    dedupConfig.getPace().initModel();
    dedupConfig.getPace().initTranslationMap();
    // dedupConfig.getWf().setConfigurationId("???");

    return dedupConfig;
}
} }

View File

@ -12,6 +12,7 @@ import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Field; import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.pace.config.DedupConfig;
public abstract class UpdateMatcher<K, T> { public abstract class UpdateMatcher<K, T> {
@ -21,16 +22,15 @@ public abstract class UpdateMatcher<K, T> {
this.multipleUpdate = multipleUpdate; this.multipleUpdate = multipleUpdate;
} }
public Collection<UpdateInfo<T>> searchUpdatesForRecord(final K res, final Collection<K> others) { public Collection<UpdateInfo<T>> searchUpdatesForRecord(final K res, final Collection<K> others, final DedupConfig dedupConfig) {
final Map<String, UpdateInfo<T>> infoMap = new HashMap<>(); final Map<String, UpdateInfo<T>> infoMap = new HashMap<>();
for (final K source : others) { for (final K source : others) {
if (source != res) { if (source != res) {
for (final UpdateInfo<T> info : findUpdates(source, res)) { for (final UpdateInfo<T> info : findUpdates(source, res, dedupConfig)) {
final String s = DigestUtils.md5Hex(info.getHighlightValueAsString()); final String s = DigestUtils.md5Hex(info.getHighlightValueAsString());
if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) { if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) {} else {
} else {
infoMap.put(s, info); infoMap.put(s, info);
} }
} }
@ -51,11 +51,7 @@ public abstract class UpdateMatcher<K, T> {
} }
} }
protected abstract List<UpdateInfo<T>> findUpdates(K source, K target); protected abstract List<UpdateInfo<T>> findUpdates(K source, K target, DedupConfig dedupConfig);
protected abstract UpdateInfo<T> generateUpdateInfo(final T highlightValue,
final K source,
final K target);
protected static boolean isMissing(final List<Field<String>> list) { protected static boolean isMissing(final List<Field<String>> list) {
return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0).getValue()); return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0).getValue());

View File

@ -13,6 +13,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Dataset; import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.pace.config.DedupConfig;
public abstract class AbstractEnrichMissingDataset public abstract class AbstractEnrichMissingDataset
extends UpdateMatcher<Pair<Result, List<Dataset>>, eu.dnetlib.broker.objects.Dataset> { extends UpdateMatcher<Pair<Result, List<Dataset>>, eu.dnetlib.broker.objects.Dataset> {
@ -27,7 +28,8 @@ public abstract class AbstractEnrichMissingDataset
@Override @Override
protected final List<UpdateInfo<eu.dnetlib.broker.objects.Dataset>> findUpdates( protected final List<UpdateInfo<eu.dnetlib.broker.objects.Dataset>> findUpdates(
final Pair<Result, List<Dataset>> source, final Pair<Result, List<Dataset>> source,
final Pair<Result, List<Dataset>> target) { final Pair<Result, List<Dataset>> target,
final DedupConfig dedupConfig) {
final Set<String> existingDatasets = target final Set<String> existingDatasets = target
.getRight() .getRight()
@ -40,21 +42,22 @@ public abstract class AbstractEnrichMissingDataset
.stream() .stream()
.filter(d -> !existingDatasets.contains(d.getId())) .filter(d -> !existingDatasets.contains(d.getId()))
.map(ConversionUtils::oafDatasetToBrokerDataset) .map(ConversionUtils::oafDatasetToBrokerDataset)
.map(i -> generateUpdateInfo(i, source, target)) .map(i -> generateUpdateInfo(i, source, target, dedupConfig))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
@Override
protected final UpdateInfo<eu.dnetlib.broker.objects.Dataset> generateUpdateInfo( protected final UpdateInfo<eu.dnetlib.broker.objects.Dataset> generateUpdateInfo(
final eu.dnetlib.broker.objects.Dataset highlightValue, final eu.dnetlib.broker.objects.Dataset highlightValue,
final Pair<Result, List<Dataset>> source, final Pair<Result, List<Dataset>> source,
final Pair<Result, List<Dataset>> target) { final Pair<Result, List<Dataset>> target,
final DedupConfig dedupConfig) {
return new UpdateInfo<>( return new UpdateInfo<>(
getTopic(), getTopic(),
highlightValue, source.getLeft(), target.getLeft(), highlightValue, source.getLeft(), target.getLeft(),
(p, rel) -> p.getDatasets().add(rel), (p, rel) -> p.getDatasets().add(rel),
rel -> rel.getInstances().get(0).getUrl()); rel -> rel.getInstances().get(0).getUrl(),
dedupConfig);
} }
public Topic getTopic() { public Topic getTopic() {

View File

@ -13,6 +13,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.pace.config.DedupConfig;
public class EnrichMissingProject public class EnrichMissingProject
extends UpdateMatcher<Pair<Result, List<Project>>, eu.dnetlib.broker.objects.Project> { extends UpdateMatcher<Pair<Result, List<Project>>, eu.dnetlib.broker.objects.Project> {
@ -23,7 +24,8 @@ public class EnrichMissingProject
@Override @Override
protected List<UpdateInfo<eu.dnetlib.broker.objects.Project>> findUpdates(final Pair<Result, List<Project>> source, protected List<UpdateInfo<eu.dnetlib.broker.objects.Project>> findUpdates(final Pair<Result, List<Project>> source,
final Pair<Result, List<Project>> target) { final Pair<Result, List<Project>> target,
final DedupConfig dedupConfig) {
if (source.getRight().isEmpty()) { if (source.getRight().isEmpty()) {
return Arrays.asList(); return Arrays.asList();
@ -32,21 +34,21 @@ public class EnrichMissingProject
.getRight() .getRight()
.stream() .stream()
.map(ConversionUtils::oafProjectToBrokerProject) .map(ConversionUtils::oafProjectToBrokerProject)
.map(p -> generateUpdateInfo(p, source, target)) .map(p -> generateUpdateInfo(p, source, target, dedupConfig))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
} }
@Override
public UpdateInfo<eu.dnetlib.broker.objects.Project> generateUpdateInfo( public UpdateInfo<eu.dnetlib.broker.objects.Project> generateUpdateInfo(
final eu.dnetlib.broker.objects.Project highlightValue, final eu.dnetlib.broker.objects.Project highlightValue,
final Pair<Result, List<Project>> source, final Pair<Result, List<Project>> source,
final Pair<Result, List<Project>> target) { final Pair<Result, List<Project>> target,
final DedupConfig dedupConfig) {
return new UpdateInfo<>( return new UpdateInfo<>(
Topic.ENRICH_MISSING_PROJECT, Topic.ENRICH_MISSING_PROJECT,
highlightValue, source.getLeft(), target.getLeft(), highlightValue, source.getLeft(), target.getLeft(),
(p, prj) -> p.getProjects().add(prj), (p, prj) -> p.getProjects().add(prj),
prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode()); prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode(), dedupConfig);
} }
} }

View File

@ -13,6 +13,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.pace.config.DedupConfig;
public class EnrichMoreProject extends UpdateMatcher<Pair<Result, List<Project>>, eu.dnetlib.broker.objects.Project> { public class EnrichMoreProject extends UpdateMatcher<Pair<Result, List<Project>>, eu.dnetlib.broker.objects.Project> {
@ -22,7 +23,8 @@ public class EnrichMoreProject extends UpdateMatcher<Pair<Result, List<Project>>
@Override @Override
protected List<UpdateInfo<eu.dnetlib.broker.objects.Project>> findUpdates(final Pair<Result, List<Project>> source, protected List<UpdateInfo<eu.dnetlib.broker.objects.Project>> findUpdates(final Pair<Result, List<Project>> source,
final Pair<Result, List<Project>> target) { final Pair<Result, List<Project>> target,
final DedupConfig dedupConfig) {
final Set<String> existingProjects = source final Set<String> existingProjects = source
.getRight() .getRight()
@ -35,20 +37,20 @@ public class EnrichMoreProject extends UpdateMatcher<Pair<Result, List<Project>>
.stream() .stream()
.filter(p -> !existingProjects.contains(p.getId())) .filter(p -> !existingProjects.contains(p.getId()))
.map(ConversionUtils::oafProjectToBrokerProject) .map(ConversionUtils::oafProjectToBrokerProject)
.map(p -> generateUpdateInfo(p, source, target)) .map(p -> generateUpdateInfo(p, source, target, dedupConfig))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
@Override
public UpdateInfo<eu.dnetlib.broker.objects.Project> generateUpdateInfo( public UpdateInfo<eu.dnetlib.broker.objects.Project> generateUpdateInfo(
final eu.dnetlib.broker.objects.Project highlightValue, final eu.dnetlib.broker.objects.Project highlightValue,
final Pair<Result, List<Project>> source, final Pair<Result, List<Project>> source,
final Pair<Result, List<Project>> target) { final Pair<Result, List<Project>> target,
final DedupConfig dedupConfig) {
return new UpdateInfo<>( return new UpdateInfo<>(
Topic.ENRICH_MORE_PROJECT, Topic.ENRICH_MORE_PROJECT,
highlightValue, source.getLeft(), target.getLeft(), highlightValue, source.getLeft(), target.getLeft(),
(p, prj) -> p.getProjects().add(prj), (p, prj) -> p.getProjects().add(prj),
prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode()); prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode(), dedupConfig);
} }
} }

View File

@ -13,6 +13,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.pace.config.DedupConfig;
public abstract class AbstractEnrichMissingPublication public abstract class AbstractEnrichMissingPublication
extends UpdateMatcher<Pair<Result, List<Publication>>, eu.dnetlib.broker.objects.Publication> { extends UpdateMatcher<Pair<Result, List<Publication>>, eu.dnetlib.broker.objects.Publication> {
@ -27,7 +28,8 @@ public abstract class AbstractEnrichMissingPublication
@Override @Override
protected final List<UpdateInfo<eu.dnetlib.broker.objects.Publication>> findUpdates( protected final List<UpdateInfo<eu.dnetlib.broker.objects.Publication>> findUpdates(
final Pair<Result, List<Publication>> source, final Pair<Result, List<Publication>> source,
final Pair<Result, List<Publication>> target) { final Pair<Result, List<Publication>> target,
final DedupConfig dedupConfig) {
final Set<String> existingPublications = target final Set<String> existingPublications = target
.getRight() .getRight()
@ -40,21 +42,21 @@ public abstract class AbstractEnrichMissingPublication
.stream() .stream()
.filter(d -> !existingPublications.contains(d.getId())) .filter(d -> !existingPublications.contains(d.getId()))
.map(ConversionUtils::oafResultToBrokerPublication) .map(ConversionUtils::oafResultToBrokerPublication)
.map(i -> generateUpdateInfo(i, source, target)) .map(i -> generateUpdateInfo(i, source, target, dedupConfig))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
@Override
protected final UpdateInfo<eu.dnetlib.broker.objects.Publication> generateUpdateInfo( protected final UpdateInfo<eu.dnetlib.broker.objects.Publication> generateUpdateInfo(
final eu.dnetlib.broker.objects.Publication highlightValue, final eu.dnetlib.broker.objects.Publication highlightValue,
final Pair<Result, List<Publication>> source, final Pair<Result, List<Publication>> source,
final Pair<Result, List<Publication>> target) { final Pair<Result, List<Publication>> target,
final DedupConfig dedupConfig) {
return new UpdateInfo<>( return new UpdateInfo<>(
getTopic(), getTopic(),
highlightValue, source.getLeft(), target.getLeft(), highlightValue, source.getLeft(), target.getLeft(),
(p, rel) -> p.getPublications().add(rel), (p, rel) -> p.getPublications().add(rel),
rel -> rel.getInstances().get(0).getUrl()); rel -> rel.getInstances().get(0).getUrl(), dedupConfig);
} }
public Topic getTopic() { public Topic getTopic() {

View File

@ -13,6 +13,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.pace.config.DedupConfig;
public class EnrichMissingSoftware public class EnrichMissingSoftware
extends UpdateMatcher<Pair<Result, List<Software>>, eu.dnetlib.broker.objects.Software> { extends UpdateMatcher<Pair<Result, List<Software>>, eu.dnetlib.broker.objects.Software> {
@ -24,7 +25,8 @@ public class EnrichMissingSoftware
@Override @Override
protected List<UpdateInfo<eu.dnetlib.broker.objects.Software>> findUpdates( protected List<UpdateInfo<eu.dnetlib.broker.objects.Software>> findUpdates(
final Pair<Result, List<Software>> source, final Pair<Result, List<Software>> source,
final Pair<Result, List<Software>> target) { final Pair<Result, List<Software>> target,
final DedupConfig dedupConfig) {
if (source.getRight().isEmpty()) { if (source.getRight().isEmpty()) {
return Arrays.asList(); return Arrays.asList();
@ -33,21 +35,21 @@ public class EnrichMissingSoftware
.getRight() .getRight()
.stream() .stream()
.map(ConversionUtils::oafSoftwareToBrokerSoftware) .map(ConversionUtils::oafSoftwareToBrokerSoftware)
.map(p -> generateUpdateInfo(p, source, target)) .map(p -> generateUpdateInfo(p, source, target, dedupConfig))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
} }
@Override
public UpdateInfo<eu.dnetlib.broker.objects.Software> generateUpdateInfo( public UpdateInfo<eu.dnetlib.broker.objects.Software> generateUpdateInfo(
final eu.dnetlib.broker.objects.Software highlightValue, final eu.dnetlib.broker.objects.Software highlightValue,
final Pair<Result, List<Software>> source, final Pair<Result, List<Software>> source,
final Pair<Result, List<Software>> target) { final Pair<Result, List<Software>> target,
final DedupConfig dedupConfig) {
return new UpdateInfo<>( return new UpdateInfo<>(
Topic.ENRICH_MISSING_SOFTWARE, Topic.ENRICH_MISSING_SOFTWARE,
highlightValue, source.getLeft(), target.getLeft(), highlightValue, source.getLeft(), target.getLeft(),
(p, s) -> p.getSoftwares().add(s), (p, s) -> p.getSoftwares().add(s),
s -> s.getName()); s -> s.getName(), dedupConfig);
} }
} }

View File

@ -13,6 +13,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.pace.config.DedupConfig;
public class EnrichMoreSoftware public class EnrichMoreSoftware
extends UpdateMatcher<Pair<Result, List<Software>>, eu.dnetlib.broker.objects.Software> { extends UpdateMatcher<Pair<Result, List<Software>>, eu.dnetlib.broker.objects.Software> {
@ -24,7 +25,8 @@ public class EnrichMoreSoftware
@Override @Override
protected List<UpdateInfo<eu.dnetlib.broker.objects.Software>> findUpdates( protected List<UpdateInfo<eu.dnetlib.broker.objects.Software>> findUpdates(
final Pair<Result, List<Software>> source, final Pair<Result, List<Software>> source,
final Pair<Result, List<Software>> target) { final Pair<Result, List<Software>> target,
final DedupConfig dedupConfig) {
final Set<String> existingSoftwares = source final Set<String> existingSoftwares = source
.getRight() .getRight()
@ -37,20 +39,20 @@ public class EnrichMoreSoftware
.stream() .stream()
.filter(p -> !existingSoftwares.contains(p.getId())) .filter(p -> !existingSoftwares.contains(p.getId()))
.map(ConversionUtils::oafSoftwareToBrokerSoftware) .map(ConversionUtils::oafSoftwareToBrokerSoftware)
.map(p -> generateUpdateInfo(p, source, target)) .map(p -> generateUpdateInfo(p, source, target, dedupConfig))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
@Override
public UpdateInfo<eu.dnetlib.broker.objects.Software> generateUpdateInfo( public UpdateInfo<eu.dnetlib.broker.objects.Software> generateUpdateInfo(
final eu.dnetlib.broker.objects.Software highlightValue, final eu.dnetlib.broker.objects.Software highlightValue,
final Pair<Result, List<Software>> source, final Pair<Result, List<Software>> source,
final Pair<Result, List<Software>> target) { final Pair<Result, List<Software>> target,
final DedupConfig dedupConfig) {
return new UpdateInfo<>( return new UpdateInfo<>(
Topic.ENRICH_MORE_SOFTWARE, Topic.ENRICH_MORE_SOFTWARE,
highlightValue, source.getLeft(), target.getLeft(), highlightValue, source.getLeft(), target.getLeft(),
(p, s) -> p.getSoftwares().add(s), (p, s) -> p.getSoftwares().add(s),
s -> s.getName()); s -> s.getName(), dedupConfig);
} }
} }

View File

@ -9,6 +9,7 @@ import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.pace.config.DedupConfig;
public class EnrichMissingAbstract extends UpdateMatcher<Result, String> { public class EnrichMissingAbstract extends UpdateMatcher<Result, String> {
@ -17,22 +18,22 @@ public class EnrichMissingAbstract extends UpdateMatcher<Result, String> {
} }
@Override @Override
protected List<UpdateInfo<String>> findUpdates(final Result source, final Result target) { protected List<UpdateInfo<String>> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) {
if (isMissing(target.getDescription()) && !isMissing(source.getDescription())) { if (isMissing(target.getDescription()) && !isMissing(source.getDescription())) {
return Arrays.asList(generateUpdateInfo(source.getDescription().get(0).getValue(), source, target)); return Arrays.asList(generateUpdateInfo(source.getDescription().get(0).getValue(), source, target, dedupConfig));
} }
return new ArrayList<>(); return new ArrayList<>();
} }
@Override
public UpdateInfo<String> generateUpdateInfo(final String highlightValue, public UpdateInfo<String> generateUpdateInfo(final String highlightValue,
final Result source, final Result source,
final Result target) { final Result target,
final DedupConfig dedupConfig) {
return new UpdateInfo<>( return new UpdateInfo<>(
Topic.ENRICH_MISSING_ABSTRACT, Topic.ENRICH_MISSING_ABSTRACT,
highlightValue, source, target, highlightValue, source, target,
(p, s) -> p.getAbstracts().add(s), (p, s) -> p.getAbstracts().add(s),
s -> s); s -> s, dedupConfig);
} }
} }

View File

@ -10,6 +10,7 @@ import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.pace.config.DedupConfig;
public class EnrichMissingAuthorOrcid extends UpdateMatcher<Result, Pair<String, String>> { public class EnrichMissingAuthorOrcid extends UpdateMatcher<Result, Pair<String, String>> {
@ -18,19 +19,21 @@ public class EnrichMissingAuthorOrcid extends UpdateMatcher<Result, Pair<String,
} }
@Override @Override
protected List<UpdateInfo<Pair<String, String>>> findUpdates(final Result source, final Result target) { protected List<UpdateInfo<Pair<String, String>>> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) {
// TODO
// return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
return Arrays.asList(); return Arrays.asList();
} }
@Override
public UpdateInfo<Pair<String, String>> generateUpdateInfo(final Pair<String, String> highlightValue, public UpdateInfo<Pair<String, String>> generateUpdateInfo(final Pair<String, String> highlightValue,
final Result source, final Result source,
final Result target) { final Result target,
final DedupConfig dedupConfig) {
return new UpdateInfo<>( return new UpdateInfo<>(
Topic.ENRICH_MISSING_AUTHOR_ORCID, Topic.ENRICH_MISSING_AUTHOR_ORCID,
highlightValue, source, target, highlightValue, source, target,
(p, pair) -> p.getCreators().add(pair.getLeft() + " - ORCID: " + pair.getRight()), (p, pair) -> p.getCreators().add(pair.getLeft() + " - ORCID: " + pair.getRight()),
pair -> pair.getLeft() + "::" + pair.getRight()); pair -> pair.getLeft() + "::" + pair.getRight(),
dedupConfig);
} }
} }

View File

@ -12,6 +12,7 @@ import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.pace.config.DedupConfig;
public class EnrichMissingOpenAccess extends UpdateMatcher<Result, Instance> { public class EnrichMissingOpenAccess extends UpdateMatcher<Result, Instance> {
@ -20,7 +21,7 @@ public class EnrichMissingOpenAccess extends UpdateMatcher<Result, Instance> {
} }
@Override @Override
protected List<UpdateInfo<Instance>> findUpdates(final Result source, final Result target) { protected List<UpdateInfo<Instance>> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) {
final long count = target final long count = target
.getInstance() .getInstance()
.stream() .stream()
@ -28,9 +29,7 @@ public class EnrichMissingOpenAccess extends UpdateMatcher<Result, Instance> {
.filter(right -> right.equals(BrokerConstants.OPEN_ACCESS)) .filter(right -> right.equals(BrokerConstants.OPEN_ACCESS))
.count(); .count();
if (count > 0) { if (count > 0) { return Arrays.asList(); }
return Arrays.asList();
}
return source return source
.getInstance() .getInstance()
@ -38,19 +37,19 @@ public class EnrichMissingOpenAccess extends UpdateMatcher<Result, Instance> {
.filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS)) .filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS))
.map(ConversionUtils::oafInstanceToBrokerInstances) .map(ConversionUtils::oafInstanceToBrokerInstances)
.flatMap(List::stream) .flatMap(List::stream)
.map(i -> generateUpdateInfo(i, source, target)) .map(i -> generateUpdateInfo(i, source, target, dedupConfig))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
@Override
public UpdateInfo<Instance> generateUpdateInfo(final Instance highlightValue, public UpdateInfo<Instance> generateUpdateInfo(final Instance highlightValue,
final Result source, final Result source,
final Result target) { final Result target,
final DedupConfig dedupConfig) {
return new UpdateInfo<>( return new UpdateInfo<>(
Topic.ENRICH_MISSING_OA_VERSION, Topic.ENRICH_MISSING_OA_VERSION,
highlightValue, source, target, highlightValue, source, target,
(p, i) -> p.getInstances().add(i), (p, i) -> p.getInstances().add(i),
Instance::getUrl); Instance::getUrl, dedupConfig);
} }
} }

View File

@ -11,6 +11,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.pace.config.DedupConfig;
public class EnrichMissingPid extends UpdateMatcher<Result, Pid> { public class EnrichMissingPid extends UpdateMatcher<Result, Pid> {
@ -19,28 +20,25 @@ public class EnrichMissingPid extends UpdateMatcher<Result, Pid> {
} }
@Override @Override
protected List<UpdateInfo<Pid>> findUpdates(final Result source, final Result target) { protected List<UpdateInfo<Pid>> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) {
final long count = target.getPid().size(); final long count = target.getPid().size();
if (count > 0) { if (count > 0) { return Arrays.asList(); }
return Arrays.asList();
}
return source return source
.getPid() .getPid()
.stream() .stream()
.map(ConversionUtils::oafPidToBrokerPid) .map(ConversionUtils::oafPidToBrokerPid)
.map(i -> generateUpdateInfo(i, source, target)) .map(i -> generateUpdateInfo(i, source, target, dedupConfig))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
@Override public UpdateInfo<Pid> generateUpdateInfo(final Pid highlightValue, final Result source, final Result target, final DedupConfig dedupConfig) {
public UpdateInfo<Pid> generateUpdateInfo(final Pid highlightValue, final Result source, final Result target) {
return new UpdateInfo<>( return new UpdateInfo<>(
Topic.ENRICH_MISSING_PID, Topic.ENRICH_MISSING_PID,
highlightValue, source, target, highlightValue, source, target,
(p, pid) -> p.getPids().add(pid), (p, pid) -> p.getPids().add(pid),
pid -> pid.getType() + "::" + pid.getValue()); pid -> pid.getType() + "::" + pid.getValue(), dedupConfig);
} }
} }

View File

@ -9,6 +9,7 @@ import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.pace.config.DedupConfig;
public class EnrichMissingPublicationDate extends UpdateMatcher<Result, String> { public class EnrichMissingPublicationDate extends UpdateMatcher<Result, String> {
@ -17,22 +18,22 @@ public class EnrichMissingPublicationDate extends UpdateMatcher<Result, String>
} }
@Override @Override
protected List<UpdateInfo<String>> findUpdates(final Result source, final Result target) { protected List<UpdateInfo<String>> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) {
if (isMissing(target.getDateofacceptance()) && !isMissing(source.getDateofacceptance())) { if (isMissing(target.getDateofacceptance()) && !isMissing(source.getDateofacceptance())) {
return Arrays.asList(generateUpdateInfo(source.getDateofacceptance().getValue(), source, target)); return Arrays.asList(generateUpdateInfo(source.getDateofacceptance().getValue(), source, target, dedupConfig));
} }
return new ArrayList<>(); return new ArrayList<>();
} }
@Override
public UpdateInfo<String> generateUpdateInfo(final String highlightValue, public UpdateInfo<String> generateUpdateInfo(final String highlightValue,
final Result source, final Result source,
final Result target) { final Result target,
final DedupConfig dedupConfig) {
return new UpdateInfo<>( return new UpdateInfo<>(
Topic.ENRICH_MISSING_PUBLICATION_DATE, Topic.ENRICH_MISSING_PUBLICATION_DATE,
highlightValue, source, target, highlightValue, source, target,
(p, date) -> p.setPublicationdate(date), (p, date) -> p.setPublicationdate(date),
s -> s); s -> s, dedupConfig);
} }
} }

View File

@ -14,6 +14,7 @@ import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.pace.config.DedupConfig;
public class EnrichMissingSubject extends UpdateMatcher<Result, Pair<String, String>> { public class EnrichMissingSubject extends UpdateMatcher<Result, Pair<String, String>> {
@ -22,7 +23,7 @@ public class EnrichMissingSubject extends UpdateMatcher<Result, Pair<String, Str
} }
@Override @Override
protected List<UpdateInfo<Pair<String, String>>> findUpdates(final Result source, final Result target) { protected List<UpdateInfo<Pair<String, String>>> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) {
final Set<String> existingTypes = target final Set<String> existingTypes = target
.getSubject() .getSubject()
.stream() .stream()
@ -35,20 +36,20 @@ public class EnrichMissingSubject extends UpdateMatcher<Result, Pair<String, Str
.stream() .stream()
.filter(pid -> !existingTypes.contains(pid.getQualifier().getClassid())) .filter(pid -> !existingTypes.contains(pid.getQualifier().getClassid()))
.map(ConversionUtils::oafSubjectToPair) .map(ConversionUtils::oafSubjectToPair)
.map(i -> generateUpdateInfo(i, source, target)) .map(i -> generateUpdateInfo(i, source, target, dedupConfig))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
@Override
public UpdateInfo<Pair<String, String>> generateUpdateInfo(final Pair<String, String> highlightValue, public UpdateInfo<Pair<String, String>> generateUpdateInfo(final Pair<String, String> highlightValue,
final Result source, final Result source,
final Result target) { final Result target,
final DedupConfig dedupConfig) {
return new UpdateInfo<>( return new UpdateInfo<>(
Topic.fromPath("ENRICH/MISSING/SUBJECT/" + highlightValue.getLeft()), Topic.fromPath("ENRICH/MISSING/SUBJECT/" + highlightValue.getLeft()),
highlightValue, source, target, highlightValue, source, target,
(p, pair) -> p.getSubjects().add(pair.getRight()), (p, pair) -> p.getSubjects().add(pair.getRight()),
pair -> pair.getLeft() + "::" + pair.getRight()); pair -> pair.getLeft() + "::" + pair.getRight(), dedupConfig);
} }
} }

View File

@ -12,6 +12,7 @@ import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.pace.config.DedupConfig;
public class EnrichMoreOpenAccess extends UpdateMatcher<Result, Instance> { public class EnrichMoreOpenAccess extends UpdateMatcher<Result, Instance> {
@ -20,7 +21,7 @@ public class EnrichMoreOpenAccess extends UpdateMatcher<Result, Instance> {
} }
@Override @Override
protected List<UpdateInfo<Instance>> findUpdates(final Result source, final Result target) { protected List<UpdateInfo<Instance>> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) {
final Set<String> urls = target final Set<String> urls = target
.getInstance() .getInstance()
.stream() .stream()
@ -36,19 +37,19 @@ public class EnrichMoreOpenAccess extends UpdateMatcher<Result, Instance> {
.map(ConversionUtils::oafInstanceToBrokerInstances) .map(ConversionUtils::oafInstanceToBrokerInstances)
.flatMap(List::stream) .flatMap(List::stream)
.filter(i -> !urls.contains(i.getUrl())) .filter(i -> !urls.contains(i.getUrl()))
.map(i -> generateUpdateInfo(i, source, target)) .map(i -> generateUpdateInfo(i, source, target, dedupConfig))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
@Override
public UpdateInfo<Instance> generateUpdateInfo(final Instance highlightValue, public UpdateInfo<Instance> generateUpdateInfo(final Instance highlightValue,
final Result source, final Result source,
final Result target) { final Result target,
final DedupConfig dedupConfig) {
return new UpdateInfo<>( return new UpdateInfo<>(
Topic.ENRICH_MORE_OA_VERSION, Topic.ENRICH_MORE_OA_VERSION,
highlightValue, source, target, highlightValue, source, target,
(p, i) -> p.getInstances().add(i), (p, i) -> p.getInstances().add(i),
Instance::getUrl); Instance::getUrl, dedupConfig);
} }
} }

View File

@ -11,6 +11,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.pace.config.DedupConfig;
public class EnrichMorePid extends UpdateMatcher<Result, Pid> { public class EnrichMorePid extends UpdateMatcher<Result, Pid> {
@ -19,7 +20,7 @@ public class EnrichMorePid extends UpdateMatcher<Result, Pid> {
} }
@Override @Override
protected List<UpdateInfo<Pid>> findUpdates(final Result source, final Result target) { protected List<UpdateInfo<Pid>> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) {
final Set<String> existingPids = target final Set<String> existingPids = target
.getPid() .getPid()
.stream() .stream()
@ -31,17 +32,16 @@ public class EnrichMorePid extends UpdateMatcher<Result, Pid> {
.stream() .stream()
.filter(pid -> !existingPids.contains(pid.getQualifier().getClassid() + "::" + pid.getValue())) .filter(pid -> !existingPids.contains(pid.getQualifier().getClassid() + "::" + pid.getValue()))
.map(ConversionUtils::oafPidToBrokerPid) .map(ConversionUtils::oafPidToBrokerPid)
.map(i -> generateUpdateInfo(i, source, target)) .map(i -> generateUpdateInfo(i, source, target, dedupConfig))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
@Override public UpdateInfo<Pid> generateUpdateInfo(final Pid highlightValue, final Result source, final Result target, final DedupConfig dedupConfig) {
public UpdateInfo<Pid> generateUpdateInfo(final Pid highlightValue, final Result source, final Result target) {
return new UpdateInfo<>( return new UpdateInfo<>(
Topic.ENRICH_MORE_PID, Topic.ENRICH_MORE_PID,
highlightValue, source, target, highlightValue, source, target,
(p, pid) -> p.getPids().add(pid), (p, pid) -> p.getPids().add(pid),
pid -> pid.getType() + "::" + pid.getValue()); pid -> pid.getType() + "::" + pid.getValue(), dedupConfig);
} }
} }

View File

@ -12,6 +12,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.pace.config.DedupConfig;
public class EnrichMoreSubject extends UpdateMatcher<Result, Pair<String, String>> { public class EnrichMoreSubject extends UpdateMatcher<Result, Pair<String, String>> {
@ -20,7 +21,7 @@ public class EnrichMoreSubject extends UpdateMatcher<Result, Pair<String, String
} }
@Override @Override
protected List<UpdateInfo<Pair<String, String>>> findUpdates(final Result source, final Result target) { protected List<UpdateInfo<Pair<String, String>>> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) {
final Set<String> existingSubjects = target final Set<String> existingSubjects = target
.getSubject() .getSubject()
.stream() .stream()
@ -32,20 +33,20 @@ public class EnrichMoreSubject extends UpdateMatcher<Result, Pair<String, String
.stream() .stream()
.filter(pid -> !existingSubjects.contains(pid.getQualifier().getClassid() + "::" + pid.getValue())) .filter(pid -> !existingSubjects.contains(pid.getQualifier().getClassid() + "::" + pid.getValue()))
.map(ConversionUtils::oafSubjectToPair) .map(ConversionUtils::oafSubjectToPair)
.map(i -> generateUpdateInfo(i, source, target)) .map(i -> generateUpdateInfo(i, source, target, dedupConfig))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
@Override
public UpdateInfo<Pair<String, String>> generateUpdateInfo(final Pair<String, String> highlightValue, public UpdateInfo<Pair<String, String>> generateUpdateInfo(final Pair<String, String> highlightValue,
final Result source, final Result source,
final Result target) { final Result target,
final DedupConfig dedupConfig) {
return new UpdateInfo<>( return new UpdateInfo<>(
Topic.fromPath("ENRICH/MORE/SUBJECT/" + highlightValue.getLeft()), Topic.fromPath("ENRICH/MORE/SUBJECT/" + highlightValue.getLeft()),
highlightValue, source, target, highlightValue, source, target,
(p, pair) -> p.getSubjects().add(pair.getRight()), (p, pair) -> p.getSubjects().add(pair.getRight()),
pair -> pair.getLeft() + "::" + pair.getRight()); pair -> pair.getLeft() + "::" + pair.getRight(), dedupConfig);
} }
} }

View File

@ -15,6 +15,9 @@ public class BrokerConstants {
public static final String OPEN_ACCESS = "OPEN"; public static final String OPEN_ACCESS = "OPEN";
public static final String IS_MERGED_IN_CLASS = "isMergedIn"; public static final String IS_MERGED_IN_CLASS = "isMergedIn";
public static final float MIN_TRUST = 0.25f;
public static final float MAX_TRUST = 1.00f;
public static final List<Class<? extends Result>> RESULT_CLASSES = Arrays public static final List<Class<? extends Result>> RESULT_CLASSES = Arrays
.asList(Publication.class, Dataset.class, Software.class, OtherResearchProduct.class); .asList(Publication.class, Dataset.class, Software.class, OtherResearchProduct.class);

View File

@ -0,0 +1,15 @@
package eu.dnetlib.dhp.broker.oa.util;
public class TrustUtils {

	/**
	 * Rescales a score into a trust value bounded by {@link BrokerConstants#MIN_TRUST} and
	 * {@link BrokerConstants#MAX_TRUST}.
	 * <p>
	 * A score of at least {@code MAX_TRUST} yields {@code MAX_TRUST}. A score strictly between
	 * {@code threshold} and {@code MAX_TRUST} is mapped linearly, with {@code threshold} mapping to 0 and
	 * {@code MAX_TRUST} mapping to {@code MAX_TRUST - MIN_TRUST}; the result is raised to {@code MIN_TRUST}
	 * when it falls below it. Every other input (score not above the threshold, NaN score, NaN threshold)
	 * yields {@code MIN_TRUST}.
	 *
	 * @param score the raw score to rescale
	 * @param threshold the score at which the linear mapping starts
	 * @return the trust value, always within {@code [MIN_TRUST, MAX_TRUST]} and never NaN
	 */
	public static float rescale(final double score, final double threshold) {
		if (score >= BrokerConstants.MAX_TRUST) { return BrokerConstants.MAX_TRUST; }

		// Written as a negated '>' so that a NaN score or threshold also ends up here instead of
		// propagating NaN into the result. It also covers threshold >= MAX_TRUST, where the division
		// below would be by zero or by a negative range.
		if (!(score > threshold)) { return BrokerConstants.MIN_TRUST; }

		// Here threshold < score < MAX_TRUST, so the divisor is positive and val cannot exceed MAX_TRUST - MIN_TRUST.
		final double val = (score - threshold) * (BrokerConstants.MAX_TRUST - BrokerConstants.MIN_TRUST) / (BrokerConstants.MAX_TRUST - threshold);

		// '>=' rather than '<' so that a NaN val (e.g. infinite threshold) falls back to the minimum as well.
		return val >= BrokerConstants.MIN_TRUST ? (float) val : BrokerConstants.MIN_TRUST;
	}

}

View File

@ -5,6 +5,11 @@ import java.util.List;
import java.util.function.BiConsumer; import java.util.function.BiConsumer;
import java.util.function.Function; import java.util.function.Function;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.broker.objects.OpenAireEventPayload; import eu.dnetlib.broker.objects.OpenAireEventPayload;
import eu.dnetlib.broker.objects.Provenance; import eu.dnetlib.broker.objects.Provenance;
import eu.dnetlib.broker.objects.Publication; import eu.dnetlib.broker.objects.Publication;
@ -12,6 +17,10 @@ import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.schema.oaf.Instance; import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.tree.support.TreeProcessor;
import eu.dnetlib.pace.util.MapDocumentUtil;
public final class UpdateInfo<T> { public final class UpdateInfo<T> {
@ -29,16 +38,19 @@ public final class UpdateInfo<T> {
private final float trust; private final float trust;
private static final Logger log = LoggerFactory.getLogger(UpdateInfo.class);
public UpdateInfo(final Topic topic, final T highlightValue, final Result source, final Result target, public UpdateInfo(final Topic topic, final T highlightValue, final Result source, final Result target,
final BiConsumer<Publication, T> compileHighlight, final BiConsumer<Publication, T> compileHighlight,
final Function<T, String> highlightToString) { final Function<T, String> highlightToString,
final DedupConfig dedupConfig) {
this.topic = topic; this.topic = topic;
this.highlightValue = highlightValue; this.highlightValue = highlightValue;
this.source = source; this.source = source;
this.target = target; this.target = target;
this.compileHighlight = compileHighlight; this.compileHighlight = compileHighlight;
this.highlightToString = highlightToString; this.highlightToString = highlightToString;
this.trust = calculateTrust(source, target); this.trust = calculateTrust(dedupConfig, source, target);
} }
public T getHighlightValue() { public T getHighlightValue() {
@ -53,9 +65,20 @@ public final class UpdateInfo<T> {
return target; return target;
} }
private float calculateTrust(final Result source, final Result target) { private float calculateTrust(final DedupConfig dedupConfig, final Result r1, final Result r2) {
// TODO try {
return 0.9f; final ObjectMapper objectMapper = new ObjectMapper();
final MapDocument doc1 = MapDocumentUtil.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r1));
final MapDocument doc2 = MapDocumentUtil.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r2));
final double score = new TreeProcessor(dedupConfig).computeScore(doc1, doc2);
final double threshold = dedupConfig.getWf().getThreshold();
return TrustUtils.rescale(score, threshold);
} catch (final Exception e) {
log.error("Error computing score between results", e);
return BrokerConstants.MIN_TRUST;
}
} }
protected Topic getTopic() { protected Topic getTopic() {
@ -95,8 +118,7 @@ public final class UpdateInfo<T> {
.map(Instance::getUrl) .map(Instance::getUrl)
.flatMap(List::stream) .flatMap(List::stream)
.findFirst() .findFirst()
.orElse(null); .orElse(null);;
;
final Provenance provenance = new Provenance().setId(provId).setRepositoryName(provRepo).setUrl(provUrl); final Provenance provenance = new Provenance().setId(provId).setRepositoryName(provRepo).setUrl(provUrl);

View File

@ -0,0 +1,72 @@
package eu.dnetlib.dhp.broker.oa.util;
import static org.junit.jupiter.api.Assertions.assertTrue;
import org.junit.jupiter.api.Test;
/**
 * Checks {@link TrustUtils#rescale(double, double)} against a fixed threshold, from scores far below
 * the threshold up to scores above {@link BrokerConstants#MAX_TRUST}.
 */
public class TrustUtilsTest {

	/** Threshold passed to {@code rescale} in every case below. */
	private static final double THRESHOLD = 0.95;

	/** Largest accepted distance between the computed and the expected trust. */
	private static final double TOLERANCE = 0.01;

	// Scores up to (and slightly above) the threshold are expected to yield MIN_TRUST.

	@Test
	public void rescaleTest_1() {
		verifyValue(-0.3, BrokerConstants.MIN_TRUST);
	}

	@Test
	public void rescaleTest_2() {
		verifyValue(0.0, BrokerConstants.MIN_TRUST);
	}

	@Test
	public void rescaleTest_3() {
		verifyValue(0.5, BrokerConstants.MIN_TRUST);
	}

	@Test
	public void rescaleTest_4() {
		verifyValue(0.95, BrokerConstants.MIN_TRUST);
	}

	@Test
	public void rescaleTest_5() {
		verifyValue(0.96, BrokerConstants.MIN_TRUST);
	}

	// Scores between the threshold and MAX_TRUST are expected to grow linearly.

	@Test
	public void rescaleTest_6() {
		verifyValue(0.97, 0.3f);
	}

	@Test
	public void rescaleTest_7() {
		verifyValue(0.98, 0.45f);
	}

	@Test
	public void rescaleTest_8() {
		verifyValue(0.99, 0.6f);
	}

	// Scores at or above MAX_TRUST are expected to yield MAX_TRUST.

	@Test
	public void rescaleTest_9() {
		verifyValue(1.00, BrokerConstants.MAX_TRUST);
	}

	@Test
	public void rescaleTest_10() {
		verifyValue(1.01, BrokerConstants.MAX_TRUST);
	}

	@Test
	public void rescaleTest_11() {
		verifyValue(2.00, BrokerConstants.MAX_TRUST);
	}

	/**
	 * Rescales {@code originalScore} with {@link #THRESHOLD} and fails when the result is not within
	 * {@link #TOLERANCE} of {@code expectedTrust}.
	 */
	private void verifyValue(final double originalScore, final float expectedTrust) {
		final float trust = TrustUtils.rescale(originalScore, THRESHOLD);
		// Report the values in the failure message instead of printing every result to stdout.
		assertTrue(
			Math.abs(trust - expectedTrust) < TOLERANCE,
			"score " + originalScore + " was rescaled to " + trust + " but " + expectedTrust + " was expected");
	}

}