enrichment steps #38
|
@ -91,35 +91,29 @@ public class GenerateEventsApplication {
|
|||
private static final UpdateMatcher<Pair<Result, List<Software>>, ?> enrichMoreSoftware = new EnrichMoreSoftware();
|
||||
|
||||
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMisissingPublicationIsRelatedTo = new EnrichMissingPublicationIsRelatedTo();
|
||||
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsReferencedBy =
|
||||
new EnrichMissingPublicationIsReferencedBy();
|
||||
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsReferencedBy = new EnrichMissingPublicationIsReferencedBy();
|
||||
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationReferences = new EnrichMissingPublicationReferences();
|
||||
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsSupplementedTo =
|
||||
new EnrichMissingPublicationIsSupplementedTo();
|
||||
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsSupplementedBy =
|
||||
new EnrichMissingPublicationIsSupplementedBy();
|
||||
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsSupplementedTo = new EnrichMissingPublicationIsSupplementedTo();
|
||||
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsSupplementedBy = new EnrichMissingPublicationIsSupplementedBy();
|
||||
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMisissingDatasetIsRelatedTo =
|
||||
new EnrichMissingDatasetIsRelatedTo();
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsReferencedBy =
|
||||
new EnrichMissingDatasetIsReferencedBy();
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetReferences =
|
||||
new EnrichMissingDatasetReferences();
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsSupplementedTo =
|
||||
new EnrichMissingDatasetIsSupplementedTo();
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsSupplementedBy =
|
||||
new EnrichMissingDatasetIsSupplementedBy();
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMisissingDatasetIsRelatedTo = new EnrichMissingDatasetIsRelatedTo();
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsReferencedBy = new EnrichMissingDatasetIsReferencedBy();
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetReferences = new EnrichMissingDatasetReferences();
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsSupplementedTo = new EnrichMissingDatasetIsSupplementedTo();
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsSupplementedBy = new EnrichMissingDatasetIsSupplementedBy();
|
||||
|
||||
// Aggregators
|
||||
private static final TypedColumn<Tuple2<Result, Relation>, ResultGroup> resultAggrTypedColumn = new ResultAggregator().toColumn();
|
||||
private static final TypedColumn<Tuple2<Result, Relation>, ResultGroup> resultAggrTypedColumn = new ResultAggregator()
|
||||
.toColumn();
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(GenerateEventsApplication.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/merge_claims_parameters.json")));
|
||||
.toString(
|
||||
GenerateEventsApplication.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/merge_claims_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final Boolean isSparkSessionManaged = Optional
|
||||
|
@ -172,18 +166,23 @@ public class GenerateEventsApplication {
|
|||
final Class<R> resultClazz,
|
||||
final DedupConfig dedupConfig) {
|
||||
|
||||
final Dataset<Result> results = readPath(spark, graphPath + "/" + resultClazz.getSimpleName().toLowerCase(), Result.class)
|
||||
.filter(r -> r.getDataInfo().getDeletedbyinference());
|
||||
final Dataset<Result> results = readPath(
|
||||
spark, graphPath + "/" + resultClazz.getSimpleName().toLowerCase(), Result.class)
|
||||
.filter(r -> r.getDataInfo().getDeletedbyinference());
|
||||
|
||||
final Dataset<Relation> mergedRels = readPath(spark, graphPath + "/relation", Relation.class)
|
||||
.filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
|
||||
|
||||
return results.joinWith(mergedRels, results.col("id").equalTo(mergedRels.col("source")), "inner")
|
||||
return results
|
||||
.joinWith(mergedRels, results.col("id").equalTo(mergedRels.col("source")), "inner")
|
||||
.groupByKey((MapFunction<Tuple2<Result, Relation>, String>) t -> t._2.getTarget(), Encoders.STRING())
|
||||
.agg(resultAggrTypedColumn)
|
||||
.map((MapFunction<Tuple2<String, ResultGroup>, ResultGroup>) t -> t._2, Encoders.kryo(ResultGroup.class))
|
||||
.filter(ResultGroup::isValid)
|
||||
.map((MapFunction<ResultGroup, EventGroup>) g -> GenerateEventsApplication.generateSimpleEvents(g, dedupConfig), Encoders.kryo(EventGroup.class))
|
||||
.map(
|
||||
(MapFunction<ResultGroup, EventGroup>) g -> GenerateEventsApplication
|
||||
.generateSimpleEvents(g, dedupConfig),
|
||||
Encoders.kryo(EventGroup.class))
|
||||
.flatMap(group -> group.getData().iterator(), Encoders.kryo(Event.class));
|
||||
}
|
||||
|
||||
|
@ -207,16 +206,19 @@ public class GenerateEventsApplication {
|
|||
return events;
|
||||
}
|
||||
|
||||
private static <SRC extends Result, TRG extends OafEntity> Dataset<Event> generateRelationEvents(final SparkSession spark,
|
||||
private static <SRC extends Result, TRG extends OafEntity> Dataset<Event> generateRelationEvents(
|
||||
final SparkSession spark,
|
||||
final String graphPath,
|
||||
final Class<SRC> sourceClass,
|
||||
final Class<TRG> targetClass,
|
||||
final DedupConfig dedupConfig) {
|
||||
|
||||
final Dataset<Result> sources = readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), Result.class)
|
||||
.filter(r -> r.getDataInfo().getDeletedbyinference());
|
||||
final Dataset<Result> sources = readPath(
|
||||
spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), Result.class)
|
||||
.filter(r -> r.getDataInfo().getDeletedbyinference());
|
||||
|
||||
final Dataset<TRG> targets = readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), targetClass);
|
||||
final Dataset<TRG> targets = readPath(
|
||||
spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), targetClass);
|
||||
|
||||
final Dataset<Relation> mergedRels = readPath(spark, graphPath + "/relation", Relation.class)
|
||||
.filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
|
||||
|
@ -224,7 +226,8 @@ public class GenerateEventsApplication {
|
|||
final Dataset<Relation> rels = readPath(spark, graphPath + "/relation", Relation.class)
|
||||
.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
|
||||
|
||||
final Dataset<ResultGroup> duplicates = sources.joinWith(mergedRels, sources.col("id").equalTo(rels.col("source")), "inner")
|
||||
final Dataset<ResultGroup> duplicates = sources
|
||||
.joinWith(mergedRels, sources.col("id").equalTo(rels.col("source")), "inner")
|
||||
.groupByKey((MapFunction<Tuple2<Result, Relation>, String>) t -> t._2.getTarget(), Encoders.STRING())
|
||||
.agg(resultAggrTypedColumn)
|
||||
.map((MapFunction<Tuple2<String, ResultGroup>, ResultGroup>) t -> t._2, Encoders.kryo(ResultGroup.class))
|
||||
|
@ -243,7 +246,8 @@ public class GenerateEventsApplication {
|
|||
return null;
|
||||
}
|
||||
|
||||
private List<Event> generateProjectsEvents(final Collection<Pair<Result, List<Project>>> childrenWithProjects, final DedupConfig dedupConfig) {
|
||||
private List<Event> generateProjectsEvents(final Collection<Pair<Result, List<Project>>> childrenWithProjects,
|
||||
final DedupConfig dedupConfig) {
|
||||
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||
|
||||
for (final Pair<Result, List<Project>> target : childrenWithProjects) {
|
||||
|
@ -254,7 +258,8 @@ public class GenerateEventsApplication {
|
|||
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private List<Event> generateSoftwareEvents(final Collection<Pair<Result, List<Software>>> childrenWithSoftwares, final DedupConfig dedupConfig) {
|
||||
private List<Event> generateSoftwareEvents(final Collection<Pair<Result, List<Software>>> childrenWithSoftwares,
|
||||
final DedupConfig dedupConfig) {
|
||||
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||
|
||||
for (final Pair<Result, List<Software>> target : childrenWithSoftwares) {
|
||||
|
@ -279,15 +284,30 @@ public class GenerateEventsApplication {
|
|||
|
||||
for (final Pair<Result, List<Publication>> target : cleanedChildrens) {
|
||||
if (relType.equals("isRelatedTo")) {
|
||||
list.addAll(enrichMisissingPublicationIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
|
||||
list
|
||||
.addAll(
|
||||
enrichMisissingPublicationIsRelatedTo
|
||||
.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
|
||||
} else if (relType.equals("references")) {
|
||||
list.addAll(enrichMissingPublicationReferences.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
|
||||
list
|
||||
.addAll(
|
||||
enrichMissingPublicationReferences
|
||||
.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
|
||||
} else if (relType.equals("isReferencedBy")) {
|
||||
list.addAll(enrichMissingPublicationIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
|
||||
list
|
||||
.addAll(
|
||||
enrichMissingPublicationIsReferencedBy
|
||||
.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
|
||||
} else if (relType.equals("isSupplementedTo")) {
|
||||
list.addAll(enrichMissingPublicationIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
|
||||
list
|
||||
.addAll(
|
||||
enrichMissingPublicationIsSupplementedTo
|
||||
.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
|
||||
} else if (relType.equals("isSupplementedBy")) {
|
||||
list.addAll(enrichMissingPublicationIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
|
||||
list
|
||||
.addAll(
|
||||
enrichMissingPublicationIsSupplementedBy
|
||||
.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -310,15 +330,29 @@ public class GenerateEventsApplication {
|
|||
|
||||
for (final Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>> target : cleanedChildrens) {
|
||||
if (relType.equals("isRelatedTo")) {
|
||||
list.addAll(enrichMisissingDatasetIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
|
||||
list
|
||||
.addAll(
|
||||
enrichMisissingDatasetIsRelatedTo
|
||||
.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
|
||||
} else if (relType.equals("references")) {
|
||||
list.addAll(enrichMissingDatasetReferences.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
|
||||
list
|
||||
.addAll(
|
||||
enrichMissingDatasetReferences.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
|
||||
} else if (relType.equals("isReferencedBy")) {
|
||||
list.addAll(enrichMissingDatasetIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
|
||||
list
|
||||
.addAll(
|
||||
enrichMissingDatasetIsReferencedBy
|
||||
.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
|
||||
} else if (relType.equals("isSupplementedTo")) {
|
||||
list.addAll(enrichMissingDatasetIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
|
||||
list
|
||||
.addAll(
|
||||
enrichMissingDatasetIsSupplementedTo
|
||||
.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
|
||||
} else if (relType.equals("isSupplementedBy")) {
|
||||
list.addAll(enrichMissingDatasetIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
|
||||
list
|
||||
.addAll(
|
||||
enrichMissingDatasetIsSupplementedBy
|
||||
.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -339,8 +373,12 @@ public class GenerateEventsApplication {
|
|||
private static DedupConfig loadDedupConfig(final String isLookupUrl, final String profId) throws Exception {
|
||||
final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
||||
|
||||
final String conf = isLookUpService.getResourceProfileByQuery(String
|
||||
.format("for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", profId));
|
||||
final String conf = isLookUpService
|
||||
.getResourceProfileByQuery(
|
||||
String
|
||||
.format(
|
||||
"for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()",
|
||||
profId));
|
||||
|
||||
final DedupConfig dedupConfig = new ObjectMapper().readValue(conf, DedupConfig.class);
|
||||
dedupConfig.getPace().initModel();
|
||||
|
|
|
@ -22,7 +22,8 @@ public abstract class UpdateMatcher<K, T> {
|
|||
this.multipleUpdate = multipleUpdate;
|
||||
}
|
||||
|
||||
public Collection<UpdateInfo<T>> searchUpdatesForRecord(final K res, final Collection<K> others, final DedupConfig dedupConfig) {
|
||||
public Collection<UpdateInfo<T>> searchUpdatesForRecord(final K res, final Collection<K> others,
|
||||
final DedupConfig dedupConfig) {
|
||||
|
||||
final Map<String, UpdateInfo<T>> infoMap = new HashMap<>();
|
||||
|
||||
|
@ -30,7 +31,8 @@ public abstract class UpdateMatcher<K, T> {
|
|||
if (source != res) {
|
||||
for (final UpdateInfo<T> info : findUpdates(source, res, dedupConfig)) {
|
||||
final String s = DigestUtils.md5Hex(info.getHighlightValueAsString());
|
||||
if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) {} else {
|
||||
if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) {
|
||||
} else {
|
||||
infoMap.put(s, info);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,9 +18,11 @@ public class EnrichMissingAbstract extends UpdateMatcher<Result, String> {
|
|||
}
|
||||
|
||||
/**
 * Proposes the source record's description as an update for a target record that lacks one.
 *
 * <p>When {@code isMissing(target.getDescription())} is true and
 * {@code isMissing(source.getDescription())} is false, the value of the first element of
 * {@code source.getDescription()} is passed to {@code generateUpdateInfo} together with
 * {@code source}, {@code target} and {@code dedupConfig}, and the result is returned wrapped
 * in a single-element list. In every other case a new, empty {@link ArrayList} is returned.
 *
 * <p>NOTE(review): this span is a rendered diff, so the method signature and the
 * {@code return Arrays.asList(...)} statement each appear twice (single-line form and
 * line-wrapped form); both forms express the same code.
 *
 * @param source record whose description may be offered as the update
 * @param target record that is checked for a missing description
 * @param dedupConfig forwarded unchanged to {@code generateUpdateInfo}
 * @return a list holding one update, or an empty list when no update applies
 */
@Override
|
||||
protected List<UpdateInfo<String>> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) {
|
||||
protected List<UpdateInfo<String>> findUpdates(final Result source, final Result target,
|
||||
final DedupConfig dedupConfig) {
|
||||
if (isMissing(target.getDescription()) && !isMissing(source.getDescription())) {
|
||||
return Arrays.asList(generateUpdateInfo(source.getDescription().get(0).getValue(), source, target, dedupConfig));
|
||||
return Arrays
|
||||
.asList(generateUpdateInfo(source.getDescription().get(0).getValue(), source, target, dedupConfig));
|
||||
}
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
|
|
@ -19,7 +19,8 @@ public class EnrichMissingAuthorOrcid extends UpdateMatcher<Result, Pair<String,
|
|||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<Pair<String, String>>> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) {
|
||||
protected List<UpdateInfo<Pair<String, String>>> findUpdates(final Result source, final Result target,
|
||||
final DedupConfig dedupConfig) {
|
||||
// TODO
|
||||
// return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
|
||||
return Arrays.asList();
|
||||
|
|
|
@ -21,7 +21,8 @@ public class EnrichMissingOpenAccess extends UpdateMatcher<Result, Instance> {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<Instance>> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) {
|
||||
protected List<UpdateInfo<Instance>> findUpdates(final Result source, final Result target,
|
||||
final DedupConfig dedupConfig) {
|
||||
final long count = target
|
||||
.getInstance()
|
||||
.stream()
|
||||
|
@ -29,7 +30,9 @@ public class EnrichMissingOpenAccess extends UpdateMatcher<Result, Instance> {
|
|||
.filter(right -> right.equals(BrokerConstants.OPEN_ACCESS))
|
||||
.count();
|
||||
|
||||
if (count > 0) { return Arrays.asList(); }
|
||||
if (count > 0) {
|
||||
return Arrays.asList();
|
||||
}
|
||||
|
||||
return source
|
||||
.getInstance()
|
||||
|
|
|
@ -20,10 +20,13 @@ public class EnrichMissingPid extends UpdateMatcher<Result, Pid> {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<Pid>> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) {
|
||||
protected List<UpdateInfo<Pid>> findUpdates(final Result source, final Result target,
|
||||
final DedupConfig dedupConfig) {
|
||||
final long count = target.getPid().size();
|
||||
|
||||
if (count > 0) { return Arrays.asList(); }
|
||||
if (count > 0) {
|
||||
return Arrays.asList();
|
||||
}
|
||||
|
||||
return source
|
||||
.getPid()
|
||||
|
@ -33,7 +36,8 @@ public class EnrichMissingPid extends UpdateMatcher<Result, Pid> {
|
|||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
public UpdateInfo<Pid> generateUpdateInfo(final Pid highlightValue, final Result source, final Result target, final DedupConfig dedupConfig) {
|
||||
public UpdateInfo<Pid> generateUpdateInfo(final Pid highlightValue, final Result source, final Result target,
|
||||
final DedupConfig dedupConfig) {
|
||||
return new UpdateInfo<>(
|
||||
Topic.ENRICH_MISSING_PID,
|
||||
highlightValue, source, target,
|
||||
|
|
|
@ -18,9 +18,11 @@ public class EnrichMissingPublicationDate extends UpdateMatcher<Result, String>
|
|||
}
|
||||
|
||||
/**
 * Proposes the source record's date of acceptance as an update for a target record that
 * lacks one.
 *
 * <p>When {@code isMissing(target.getDateofacceptance())} is true and
 * {@code isMissing(source.getDateofacceptance())} is false, the value of
 * {@code source.getDateofacceptance()} is passed to {@code generateUpdateInfo} together with
 * {@code source}, {@code target} and {@code dedupConfig}, and the result is returned wrapped
 * in a single-element list. In every other case a new, empty {@link ArrayList} is returned.
 *
 * <p>NOTE(review): this span is a rendered diff, so the method signature and the
 * {@code return Arrays.asList(...)} statement each appear twice (single-line form and
 * line-wrapped form); both forms express the same code.
 *
 * @param source record whose date of acceptance may be offered as the update
 * @param target record that is checked for a missing date of acceptance
 * @param dedupConfig forwarded unchanged to {@code generateUpdateInfo}
 * @return a list holding one update, or an empty list when no update applies
 */
@Override
|
||||
protected List<UpdateInfo<String>> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) {
|
||||
protected List<UpdateInfo<String>> findUpdates(final Result source, final Result target,
|
||||
final DedupConfig dedupConfig) {
|
||||
if (isMissing(target.getDateofacceptance()) && !isMissing(source.getDateofacceptance())) {
|
||||
return Arrays.asList(generateUpdateInfo(source.getDateofacceptance().getValue(), source, target, dedupConfig));
|
||||
return Arrays
|
||||
.asList(generateUpdateInfo(source.getDateofacceptance().getValue(), source, target, dedupConfig));
|
||||
}
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
|
|
@ -23,7 +23,8 @@ public class EnrichMissingSubject extends UpdateMatcher<Result, Pair<String, Str
|
|||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<Pair<String, String>>> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) {
|
||||
protected List<UpdateInfo<Pair<String, String>>> findUpdates(final Result source, final Result target,
|
||||
final DedupConfig dedupConfig) {
|
||||
final Set<String> existingTypes = target
|
||||
.getSubject()
|
||||
.stream()
|
||||
|
|
|
@ -21,7 +21,8 @@ public class EnrichMoreOpenAccess extends UpdateMatcher<Result, Instance> {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<Instance>> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) {
|
||||
protected List<UpdateInfo<Instance>> findUpdates(final Result source, final Result target,
|
||||
final DedupConfig dedupConfig) {
|
||||
final Set<String> urls = target
|
||||
.getInstance()
|
||||
.stream()
|
||||
|
|
|
@ -20,7 +20,8 @@ public class EnrichMorePid extends UpdateMatcher<Result, Pid> {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<Pid>> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) {
|
||||
protected List<UpdateInfo<Pid>> findUpdates(final Result source, final Result target,
|
||||
final DedupConfig dedupConfig) {
|
||||
final Set<String> existingPids = target
|
||||
.getPid()
|
||||
.stream()
|
||||
|
@ -36,7 +37,8 @@ public class EnrichMorePid extends UpdateMatcher<Result, Pid> {
|
|||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
public UpdateInfo<Pid> generateUpdateInfo(final Pid highlightValue, final Result source, final Result target, final DedupConfig dedupConfig) {
|
||||
public UpdateInfo<Pid> generateUpdateInfo(final Pid highlightValue, final Result source, final Result target,
|
||||
final DedupConfig dedupConfig) {
|
||||
return new UpdateInfo<>(
|
||||
Topic.ENRICH_MORE_PID,
|
||||
highlightValue, source, target,
|
||||
|
|
|
@ -21,7 +21,8 @@ public class EnrichMoreSubject extends UpdateMatcher<Result, Pair<String, String
|
|||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<Pair<String, String>>> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) {
|
||||
protected List<UpdateInfo<Pair<String, String>>> findUpdates(final Result source, final Result target,
|
||||
final DedupConfig dedupConfig) {
|
||||
final Set<String> existingSubjects = target
|
||||
.getSubject()
|
||||
.stream()
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util;
|
||||
|
||||
import org.apache.spark.sql.Encoder;
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
|
|
@ -1,14 +1,22 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util;
|
||||
|
||||
public class TrustUtils {
|
||||
|
||||
/**
 * Rescales {@code score} relative to {@code threshold} and clamps the result between
 * {@code BrokerConstants.MIN_TRUST} and {@code BrokerConstants.MAX_TRUST}.
 *
 * <p>Steps, in order:
 * <ol>
 * <li>if {@code score >= MAX_TRUST}, {@code MAX_TRUST} is returned immediately;</li>
 * <li>otherwise {@code val = (score - threshold) * (MAX_TRUST - MIN_TRUST) / (MAX_TRUST - threshold)};</li>
 * <li>{@code val} below {@code MIN_TRUST} yields {@code MIN_TRUST}, {@code val} above
 * {@code MAX_TRUST} yields {@code MAX_TRUST};</li>
 * <li>any other {@code val} is narrowed to {@code float} and returned.</li>
 * </ol>
 *
 * <p>NOTE(review): the divisor {@code MAX_TRUST - threshold} is zero when
 * {@code threshold == MAX_TRUST}; confirm that callers never pass such a threshold.
 *
 * <p>NOTE(review): this span is a rendered diff, so each guard and the {@code val}
 * computation appear twice (one-line form and expanded form); both forms express the same code.
 *
 * @param score value to rescale (presumably a similarity score; verify against callers)
 * @param threshold value subtracted from {@code score} and from {@code MAX_TRUST} in the formula
 * @return the rescaled value, clamped as described above
 */
public static float rescale(final double score, final double threshold) {
|
||||
if (score >= BrokerConstants.MAX_TRUST) { return BrokerConstants.MAX_TRUST; }
|
||||
if (score >= BrokerConstants.MAX_TRUST) {
|
||||
return BrokerConstants.MAX_TRUST;
|
||||
}
|
||||

||||
final double val = (score - threshold) * (BrokerConstants.MAX_TRUST - BrokerConstants.MIN_TRUST) / (BrokerConstants.MAX_TRUST - threshold);
|
||||
final double val = (score - threshold) * (BrokerConstants.MAX_TRUST - BrokerConstants.MIN_TRUST)
|
||||
/ (BrokerConstants.MAX_TRUST - threshold);
|
||||

||||
if (val < BrokerConstants.MIN_TRUST) { return BrokerConstants.MIN_TRUST; }
|
||||
if (val > BrokerConstants.MAX_TRUST) { return BrokerConstants.MAX_TRUST; }
|
||||
if (val < BrokerConstants.MIN_TRUST) {
|
||||
return BrokerConstants.MIN_TRUST;
|
||||
}
|
||||
if (val > BrokerConstants.MAX_TRUST) {
|
||||
return BrokerConstants.MAX_TRUST;
|
||||
}
|
||||

||||
return (float) val;
|
||||
}
|
||||
|
|
|
@ -68,8 +68,10 @@ public final class UpdateInfo<T> {
|
|||
private float calculateTrust(final DedupConfig dedupConfig, final Result r1, final Result r2) {
|
||||
try {
|
||||
final ObjectMapper objectMapper = new ObjectMapper();
|
||||
final MapDocument doc1 = MapDocumentUtil.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r1));
|
||||
final MapDocument doc2 = MapDocumentUtil.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r2));
|
||||
final MapDocument doc1 = MapDocumentUtil
|
||||
.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r1));
|
||||
final MapDocument doc2 = MapDocumentUtil
|
||||
.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r2));
|
||||
|
||||
final double score = new TreeProcessor(dedupConfig).computeScore(doc1, doc2);
|
||||
final double threshold = dedupConfig.getWf().getThreshold();
|
||||
|
@ -118,7 +120,8 @@ public final class UpdateInfo<T> {
|
|||
.map(Instance::getUrl)
|
||||
.flatMap(List::stream)
|
||||
.findFirst()
|
||||
.orElse(null);;
|
||||
.orElse(null);
|
||||
;
|
||||
|
||||
final Provenance provenance = new Provenance().setId(provId).setRepositoryName(provRepo).setUrl(provUrl);
|
||||
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
|
Loading…
Reference in New Issue