enrichment steps #38

Merged
claudio.atzori merged 334 commits from miriam.baglioni/dnet-hadoop:master into enrichment_wfs 2020-08-11 16:40:26 +02:00
17 changed files with 51 additions and 39 deletions
Showing only changes of commit 2393d9da2f - Show all commits

View File

@ -1,13 +1,14 @@
package eu.dnetlib.dhp.broker.oa.matchers;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.BiConsumer;
import java.util.function.Function;
import java.util.stream.Collectors;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
@ -19,15 +20,15 @@ import eu.dnetlib.pace.config.DedupConfig;
public abstract class UpdateMatcher<T> {
private final boolean multipleUpdate;
private final int maxNumber;
private final Function<T, Topic> topicFunction;
private final BiConsumer<OaBrokerMainEntity, T> compileHighlightFunction;
private final Function<T, String> highlightToStringFunction;
public UpdateMatcher(final boolean multipleUpdate, final Function<T, Topic> topicFunction,
public UpdateMatcher(final int maxNumber, final Function<T, Topic> topicFunction,
final BiConsumer<OaBrokerMainEntity, T> compileHighlightFunction,
final Function<T, String> highlightToStringFunction) {
this.multipleUpdate = multipleUpdate;
this.maxNumber = maxNumber;
this.topicFunction = topicFunction;
this.compileHighlightFunction = compileHighlightFunction;
this.highlightToStringFunction = highlightToStringFunction;
@ -57,17 +58,19 @@ public abstract class UpdateMatcher<T> {
}
}
final Collection<UpdateInfo<T>> values = infoMap.values();
if (values.isEmpty() || multipleUpdate) {
return values;
} else {
final UpdateInfo<T> v = values
final List<UpdateInfo<T>> values = infoMap
.values()
.stream()
.sorted((o1, o2) -> Float.compare(o1.getTrust(), o2.getTrust()))
.findFirst()
.get();
return Arrays.asList(v);
.sorted((o1, o2) -> Float.compare(o2.getTrust(), o1.getTrust())) // DESCENDING
.collect(Collectors.toList());
if (values.isEmpty()) {
return new ArrayList<>();
} else if (values.size() > maxNumber) {
System.err.println("Too many events (" + values.size() + ") matched by " + getClass().getSimpleName());
return values.subList(0, maxNumber);
} else {
return values;
}
}
@ -81,8 +84,8 @@ public abstract class UpdateMatcher<T> {
return StringUtils.isBlank(field);
}
public boolean isMultipleUpdate() {
return multipleUpdate;
public int getMaxNumber() {
return maxNumber;
}
public Function<T, Topic> getTopicFunction() {

View File

@ -13,7 +13,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
public abstract class AbstractEnrichMissingDataset extends UpdateMatcher<OaBrokerRelatedDataset> {
public AbstractEnrichMissingDataset(final Topic topic) {
super(true,
super(10,
rel -> topic,
(p, rel) -> p.getDatasets().add(rel),
rel -> rel.getOpenaireId());

View File

@ -12,7 +12,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
public class EnrichMissingProject extends UpdateMatcher<OaBrokerProject> {
public EnrichMissingProject() {
super(true,
super(20,
prj -> Topic.ENRICH_MISSING_PROJECT,
(p, prj) -> p.getProjects().add(prj),
prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode());

View File

@ -13,7 +13,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
public class EnrichMoreProject extends UpdateMatcher<OaBrokerProject> {
public EnrichMoreProject() {
super(true,
super(20,
prj -> Topic.ENRICH_MORE_PROJECT,
(p, prj) -> p.getProjects().add(prj),
prj -> projectAsString(prj));

View File

@ -13,7 +13,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
public abstract class AbstractEnrichMissingPublication extends UpdateMatcher<OaBrokerRelatedPublication> {
public AbstractEnrichMissingPublication(final Topic topic) {
super(true,
super(10,
rel -> topic,
(p, rel) -> p.getPublications().add(rel),
rel -> rel.getOpenaireId());

View File

@ -13,7 +13,7 @@ public class EnrichMissingSoftware
extends UpdateMatcher<OaBrokerRelatedSoftware> {
public EnrichMissingSoftware() {
super(true,
super(10,
s -> Topic.ENRICH_MISSING_SOFTWARE,
(p, s) -> p.getSoftwares().add(s),
s -> s.getOpenaireId());

View File

@ -13,7 +13,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
public class EnrichMoreSoftware extends UpdateMatcher<OaBrokerRelatedSoftware> {
public EnrichMoreSoftware() {
super(true,
super(10,
s -> Topic.ENRICH_MORE_SOFTWARE,
(p, s) -> p.getSoftwares().add(s),
s -> s.getOpenaireId());

View File

@ -12,7 +12,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
public class EnrichMissingAbstract extends UpdateMatcher<String> {
public EnrichMissingAbstract() {
super(false,
super(1,
s -> Topic.ENRICH_MISSING_ABSTRACT,
(p, s) -> p.getAbstracts().add(s),
s -> s);

View File

@ -15,7 +15,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
public class EnrichMissingAuthorOrcid extends UpdateMatcher<OaBrokerAuthor> {
public EnrichMissingAuthorOrcid() {
super(true,
super(40,
aut -> Topic.ENRICH_MISSING_AUTHOR_ORCID,
(p, aut) -> p.getCreators().add(aut),
aut -> aut.getOrcid());

View File

@ -14,7 +14,7 @@ import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
public class EnrichMissingOpenAccess extends UpdateMatcher<OaBrokerInstance> {
public EnrichMissingOpenAccess() {
super(true,
super(20,
i -> Topic.ENRICH_MISSING_OA_VERSION,
(p, i) -> p.getInstances().add(i),
OaBrokerInstance::getUrl);

View File

@ -13,7 +13,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
public class EnrichMissingPid extends UpdateMatcher<OaBrokerTypedValue> {
public EnrichMissingPid() {
super(true,
super(10,
pid -> Topic.ENRICH_MISSING_PID,
(p, pid) -> p.getPids().add(pid),
pid -> pid.getType() + "::" + pid.getValue());

View File

@ -12,7 +12,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
public class EnrichMissingPublicationDate extends UpdateMatcher<String> {
public EnrichMissingPublicationDate() {
super(false,
super(1,
date -> Topic.ENRICH_MISSING_PUBLICATION_DATE,
(p, date) -> p.setPublicationdate(date),
s -> s);

View File

@ -13,7 +13,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
public class EnrichMissingSubject extends UpdateMatcher<OaBrokerTypedValue> {
public EnrichMissingSubject() {
super(true,
super(20,
s -> Topic.fromPath("ENRICH/MISSING/SUBJECT/" + s.getType()),
(p, s) -> p.getSubjects().add(s),
s -> subjectAsString(s));

View File

@ -14,7 +14,7 @@ import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
public class EnrichMoreOpenAccess extends UpdateMatcher<OaBrokerInstance> {
public EnrichMoreOpenAccess() {
super(true,
super(20,
i -> Topic.ENRICH_MORE_OA_VERSION,
(p, i) -> p.getInstances().add(i),
OaBrokerInstance::getUrl);

View File

@ -13,7 +13,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
public class EnrichMorePid extends UpdateMatcher<OaBrokerTypedValue> {
public EnrichMorePid() {
super(true,
super(20,
pid -> Topic.ENRICH_MORE_PID,
(p, pid) -> p.getPids().add(pid),
pid -> pidAsString(pid));

View File

@ -13,7 +13,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
public class EnrichMoreSubject extends UpdateMatcher<OaBrokerTypedValue> {
public EnrichMoreSubject() {
super(true,
super(20,
s -> Topic.fromPath("ENRICH/MORE/SUBJECT/" + s.getType()),
(p, s) -> p.getSubjects().add(s),
s -> subjectAsString(s));

View File

@ -7,7 +7,16 @@ import java.util.List;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.dhp.broker.model.EventFactory;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMissingProject;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAbstract;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAuthorOrcid;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingOpenAccess;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPid;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPublicationDate;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingSubject;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreOpenAccess;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMorePid;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreSubject;
import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup;
import eu.dnetlib.pace.config.DedupConfig;
@ -16,17 +25,17 @@ public class EventFinder {
private static List<UpdateMatcher<?>> matchers = new ArrayList<>();
static {
matchers.add(new EnrichMissingAbstract());
// matchers.add(new EnrichMissingAuthorOrcid());
// matchers.add(new EnrichMissingOpenAccess());
// matchers.add(new EnrichMissingPid());
// matchers.add(new EnrichMissingPublicationDate());
// matchers.add(new EnrichMissingSubject());
// matchers.add(new EnrichMoreOpenAccess());
// matchers.add(new EnrichMorePid());
// matchers.add(new EnrichMoreSubject());
matchers.add(new EnrichMissingAuthorOrcid());
matchers.add(new EnrichMissingOpenAccess());
matchers.add(new EnrichMissingPid());
matchers.add(new EnrichMissingPublicationDate());
matchers.add(new EnrichMissingSubject());
matchers.add(new EnrichMoreOpenAccess());
matchers.add(new EnrichMorePid());
matchers.add(new EnrichMoreSubject());
// Advanced matchers
// matchers.add(new EnrichMissingProject());
matchers.add(new EnrichMissingProject());
// matchers.add(new EnrichMoreProject());
// matchers.add(new EnrichMissingSoftware());
// matchers.add(new EnrichMoreSoftware());