forked from D-Net/dnet-hadoop
limits
This commit is contained in:
parent
408165a756
commit
2393d9da2f
|
@ -1,13 +1,14 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.matchers;
|
package eu.dnetlib.dhp.broker.oa.matchers;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.ArrayList;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.function.BiConsumer;
|
import java.util.function.BiConsumer;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.codec.digest.DigestUtils;
|
import org.apache.commons.codec.digest.DigestUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
@ -19,15 +20,15 @@ import eu.dnetlib.pace.config.DedupConfig;
|
||||||
|
|
||||||
public abstract class UpdateMatcher<T> {
|
public abstract class UpdateMatcher<T> {
|
||||||
|
|
||||||
private final boolean multipleUpdate;
|
private final int maxNumber;
|
||||||
private final Function<T, Topic> topicFunction;
|
private final Function<T, Topic> topicFunction;
|
||||||
private final BiConsumer<OaBrokerMainEntity, T> compileHighlightFunction;
|
private final BiConsumer<OaBrokerMainEntity, T> compileHighlightFunction;
|
||||||
private final Function<T, String> highlightToStringFunction;
|
private final Function<T, String> highlightToStringFunction;
|
||||||
|
|
||||||
public UpdateMatcher(final boolean multipleUpdate, final Function<T, Topic> topicFunction,
|
public UpdateMatcher(final int maxNumber, final Function<T, Topic> topicFunction,
|
||||||
final BiConsumer<OaBrokerMainEntity, T> compileHighlightFunction,
|
final BiConsumer<OaBrokerMainEntity, T> compileHighlightFunction,
|
||||||
final Function<T, String> highlightToStringFunction) {
|
final Function<T, String> highlightToStringFunction) {
|
||||||
this.multipleUpdate = multipleUpdate;
|
this.maxNumber = maxNumber;
|
||||||
this.topicFunction = topicFunction;
|
this.topicFunction = topicFunction;
|
||||||
this.compileHighlightFunction = compileHighlightFunction;
|
this.compileHighlightFunction = compileHighlightFunction;
|
||||||
this.highlightToStringFunction = highlightToStringFunction;
|
this.highlightToStringFunction = highlightToStringFunction;
|
||||||
|
@ -57,17 +58,19 @@ public abstract class UpdateMatcher<T> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
final Collection<UpdateInfo<T>> values = infoMap.values();
|
final List<UpdateInfo<T>> values = infoMap
|
||||||
|
.values()
|
||||||
|
.stream()
|
||||||
|
.sorted((o1, o2) -> Float.compare(o2.getTrust(), o1.getTrust())) // DESCENDING
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
if (values.isEmpty() || multipleUpdate) {
|
if (values.isEmpty()) {
|
||||||
return values;
|
return new ArrayList<>();
|
||||||
|
} else if (values.size() > maxNumber) {
|
||||||
|
System.err.println("Too many events (" + values.size() + ") matched by " + getClass().getSimpleName());
|
||||||
|
return values.subList(0, maxNumber);
|
||||||
} else {
|
} else {
|
||||||
final UpdateInfo<T> v = values
|
return values;
|
||||||
.stream()
|
|
||||||
.sorted((o1, o2) -> Float.compare(o1.getTrust(), o2.getTrust()))
|
|
||||||
.findFirst()
|
|
||||||
.get();
|
|
||||||
return Arrays.asList(v);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -81,8 +84,8 @@ public abstract class UpdateMatcher<T> {
|
||||||
return StringUtils.isBlank(field);
|
return StringUtils.isBlank(field);
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean isMultipleUpdate() {
|
public int getMaxNumber() {
|
||||||
return multipleUpdate;
|
return maxNumber;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Function<T, Topic> getTopicFunction() {
|
public Function<T, Topic> getTopicFunction() {
|
||||||
|
|
|
@ -13,7 +13,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
public abstract class AbstractEnrichMissingDataset extends UpdateMatcher<OaBrokerRelatedDataset> {
|
public abstract class AbstractEnrichMissingDataset extends UpdateMatcher<OaBrokerRelatedDataset> {
|
||||||
|
|
||||||
public AbstractEnrichMissingDataset(final Topic topic) {
|
public AbstractEnrichMissingDataset(final Topic topic) {
|
||||||
super(true,
|
super(10,
|
||||||
rel -> topic,
|
rel -> topic,
|
||||||
(p, rel) -> p.getDatasets().add(rel),
|
(p, rel) -> p.getDatasets().add(rel),
|
||||||
rel -> rel.getOpenaireId());
|
rel -> rel.getOpenaireId());
|
||||||
|
|
|
@ -12,7 +12,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
public class EnrichMissingProject extends UpdateMatcher<OaBrokerProject> {
|
public class EnrichMissingProject extends UpdateMatcher<OaBrokerProject> {
|
||||||
|
|
||||||
public EnrichMissingProject() {
|
public EnrichMissingProject() {
|
||||||
super(true,
|
super(20,
|
||||||
prj -> Topic.ENRICH_MISSING_PROJECT,
|
prj -> Topic.ENRICH_MISSING_PROJECT,
|
||||||
(p, prj) -> p.getProjects().add(prj),
|
(p, prj) -> p.getProjects().add(prj),
|
||||||
prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode());
|
prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode());
|
||||||
|
|
|
@ -13,7 +13,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
public class EnrichMoreProject extends UpdateMatcher<OaBrokerProject> {
|
public class EnrichMoreProject extends UpdateMatcher<OaBrokerProject> {
|
||||||
|
|
||||||
public EnrichMoreProject() {
|
public EnrichMoreProject() {
|
||||||
super(true,
|
super(20,
|
||||||
prj -> Topic.ENRICH_MORE_PROJECT,
|
prj -> Topic.ENRICH_MORE_PROJECT,
|
||||||
(p, prj) -> p.getProjects().add(prj),
|
(p, prj) -> p.getProjects().add(prj),
|
||||||
prj -> projectAsString(prj));
|
prj -> projectAsString(prj));
|
||||||
|
|
|
@ -13,7 +13,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
public abstract class AbstractEnrichMissingPublication extends UpdateMatcher<OaBrokerRelatedPublication> {
|
public abstract class AbstractEnrichMissingPublication extends UpdateMatcher<OaBrokerRelatedPublication> {
|
||||||
|
|
||||||
public AbstractEnrichMissingPublication(final Topic topic) {
|
public AbstractEnrichMissingPublication(final Topic topic) {
|
||||||
super(true,
|
super(10,
|
||||||
rel -> topic,
|
rel -> topic,
|
||||||
(p, rel) -> p.getPublications().add(rel),
|
(p, rel) -> p.getPublications().add(rel),
|
||||||
rel -> rel.getOpenaireId());
|
rel -> rel.getOpenaireId());
|
||||||
|
|
|
@ -13,7 +13,7 @@ public class EnrichMissingSoftware
|
||||||
extends UpdateMatcher<OaBrokerRelatedSoftware> {
|
extends UpdateMatcher<OaBrokerRelatedSoftware> {
|
||||||
|
|
||||||
public EnrichMissingSoftware() {
|
public EnrichMissingSoftware() {
|
||||||
super(true,
|
super(10,
|
||||||
s -> Topic.ENRICH_MISSING_SOFTWARE,
|
s -> Topic.ENRICH_MISSING_SOFTWARE,
|
||||||
(p, s) -> p.getSoftwares().add(s),
|
(p, s) -> p.getSoftwares().add(s),
|
||||||
s -> s.getOpenaireId());
|
s -> s.getOpenaireId());
|
||||||
|
|
|
@ -13,7 +13,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
public class EnrichMoreSoftware extends UpdateMatcher<OaBrokerRelatedSoftware> {
|
public class EnrichMoreSoftware extends UpdateMatcher<OaBrokerRelatedSoftware> {
|
||||||
|
|
||||||
public EnrichMoreSoftware() {
|
public EnrichMoreSoftware() {
|
||||||
super(true,
|
super(10,
|
||||||
s -> Topic.ENRICH_MORE_SOFTWARE,
|
s -> Topic.ENRICH_MORE_SOFTWARE,
|
||||||
(p, s) -> p.getSoftwares().add(s),
|
(p, s) -> p.getSoftwares().add(s),
|
||||||
s -> s.getOpenaireId());
|
s -> s.getOpenaireId());
|
||||||
|
|
|
@ -12,7 +12,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
public class EnrichMissingAbstract extends UpdateMatcher<String> {
|
public class EnrichMissingAbstract extends UpdateMatcher<String> {
|
||||||
|
|
||||||
public EnrichMissingAbstract() {
|
public EnrichMissingAbstract() {
|
||||||
super(false,
|
super(1,
|
||||||
s -> Topic.ENRICH_MISSING_ABSTRACT,
|
s -> Topic.ENRICH_MISSING_ABSTRACT,
|
||||||
(p, s) -> p.getAbstracts().add(s),
|
(p, s) -> p.getAbstracts().add(s),
|
||||||
s -> s);
|
s -> s);
|
||||||
|
|
|
@ -15,7 +15,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
public class EnrichMissingAuthorOrcid extends UpdateMatcher<OaBrokerAuthor> {
|
public class EnrichMissingAuthorOrcid extends UpdateMatcher<OaBrokerAuthor> {
|
||||||
|
|
||||||
public EnrichMissingAuthorOrcid() {
|
public EnrichMissingAuthorOrcid() {
|
||||||
super(true,
|
super(40,
|
||||||
aut -> Topic.ENRICH_MISSING_AUTHOR_ORCID,
|
aut -> Topic.ENRICH_MISSING_AUTHOR_ORCID,
|
||||||
(p, aut) -> p.getCreators().add(aut),
|
(p, aut) -> p.getCreators().add(aut),
|
||||||
aut -> aut.getOrcid());
|
aut -> aut.getOrcid());
|
||||||
|
|
|
@ -14,7 +14,7 @@ import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
|
||||||
public class EnrichMissingOpenAccess extends UpdateMatcher<OaBrokerInstance> {
|
public class EnrichMissingOpenAccess extends UpdateMatcher<OaBrokerInstance> {
|
||||||
|
|
||||||
public EnrichMissingOpenAccess() {
|
public EnrichMissingOpenAccess() {
|
||||||
super(true,
|
super(20,
|
||||||
i -> Topic.ENRICH_MISSING_OA_VERSION,
|
i -> Topic.ENRICH_MISSING_OA_VERSION,
|
||||||
(p, i) -> p.getInstances().add(i),
|
(p, i) -> p.getInstances().add(i),
|
||||||
OaBrokerInstance::getUrl);
|
OaBrokerInstance::getUrl);
|
||||||
|
|
|
@ -13,7 +13,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
public class EnrichMissingPid extends UpdateMatcher<OaBrokerTypedValue> {
|
public class EnrichMissingPid extends UpdateMatcher<OaBrokerTypedValue> {
|
||||||
|
|
||||||
public EnrichMissingPid() {
|
public EnrichMissingPid() {
|
||||||
super(true,
|
super(10,
|
||||||
pid -> Topic.ENRICH_MISSING_PID,
|
pid -> Topic.ENRICH_MISSING_PID,
|
||||||
(p, pid) -> p.getPids().add(pid),
|
(p, pid) -> p.getPids().add(pid),
|
||||||
pid -> pid.getType() + "::" + pid.getValue());
|
pid -> pid.getType() + "::" + pid.getValue());
|
||||||
|
|
|
@ -12,7 +12,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
public class EnrichMissingPublicationDate extends UpdateMatcher<String> {
|
public class EnrichMissingPublicationDate extends UpdateMatcher<String> {
|
||||||
|
|
||||||
public EnrichMissingPublicationDate() {
|
public EnrichMissingPublicationDate() {
|
||||||
super(false,
|
super(1,
|
||||||
date -> Topic.ENRICH_MISSING_PUBLICATION_DATE,
|
date -> Topic.ENRICH_MISSING_PUBLICATION_DATE,
|
||||||
(p, date) -> p.setPublicationdate(date),
|
(p, date) -> p.setPublicationdate(date),
|
||||||
s -> s);
|
s -> s);
|
||||||
|
|
|
@ -13,7 +13,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
public class EnrichMissingSubject extends UpdateMatcher<OaBrokerTypedValue> {
|
public class EnrichMissingSubject extends UpdateMatcher<OaBrokerTypedValue> {
|
||||||
|
|
||||||
public EnrichMissingSubject() {
|
public EnrichMissingSubject() {
|
||||||
super(true,
|
super(20,
|
||||||
s -> Topic.fromPath("ENRICH/MISSING/SUBJECT/" + s.getType()),
|
s -> Topic.fromPath("ENRICH/MISSING/SUBJECT/" + s.getType()),
|
||||||
(p, s) -> p.getSubjects().add(s),
|
(p, s) -> p.getSubjects().add(s),
|
||||||
s -> subjectAsString(s));
|
s -> subjectAsString(s));
|
||||||
|
|
|
@ -14,7 +14,7 @@ import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
|
||||||
public class EnrichMoreOpenAccess extends UpdateMatcher<OaBrokerInstance> {
|
public class EnrichMoreOpenAccess extends UpdateMatcher<OaBrokerInstance> {
|
||||||
|
|
||||||
public EnrichMoreOpenAccess() {
|
public EnrichMoreOpenAccess() {
|
||||||
super(true,
|
super(20,
|
||||||
i -> Topic.ENRICH_MORE_OA_VERSION,
|
i -> Topic.ENRICH_MORE_OA_VERSION,
|
||||||
(p, i) -> p.getInstances().add(i),
|
(p, i) -> p.getInstances().add(i),
|
||||||
OaBrokerInstance::getUrl);
|
OaBrokerInstance::getUrl);
|
||||||
|
|
|
@ -13,7 +13,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
public class EnrichMorePid extends UpdateMatcher<OaBrokerTypedValue> {
|
public class EnrichMorePid extends UpdateMatcher<OaBrokerTypedValue> {
|
||||||
|
|
||||||
public EnrichMorePid() {
|
public EnrichMorePid() {
|
||||||
super(true,
|
super(20,
|
||||||
pid -> Topic.ENRICH_MORE_PID,
|
pid -> Topic.ENRICH_MORE_PID,
|
||||||
(p, pid) -> p.getPids().add(pid),
|
(p, pid) -> p.getPids().add(pid),
|
||||||
pid -> pidAsString(pid));
|
pid -> pidAsString(pid));
|
||||||
|
|
|
@ -13,7 +13,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
public class EnrichMoreSubject extends UpdateMatcher<OaBrokerTypedValue> {
|
public class EnrichMoreSubject extends UpdateMatcher<OaBrokerTypedValue> {
|
||||||
|
|
||||||
public EnrichMoreSubject() {
|
public EnrichMoreSubject() {
|
||||||
super(true,
|
super(20,
|
||||||
s -> Topic.fromPath("ENRICH/MORE/SUBJECT/" + s.getType()),
|
s -> Topic.fromPath("ENRICH/MORE/SUBJECT/" + s.getType()),
|
||||||
(p, s) -> p.getSubjects().add(s),
|
(p, s) -> p.getSubjects().add(s),
|
||||||
s -> subjectAsString(s));
|
s -> subjectAsString(s));
|
||||||
|
|
|
@ -7,7 +7,16 @@ import java.util.List;
|
||||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||||
import eu.dnetlib.dhp.broker.model.EventFactory;
|
import eu.dnetlib.dhp.broker.model.EventFactory;
|
||||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMissingProject;
|
||||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAbstract;
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAbstract;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAuthorOrcid;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingOpenAccess;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPid;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPublicationDate;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingSubject;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreOpenAccess;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMorePid;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreSubject;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup;
|
import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup;
|
||||||
import eu.dnetlib.pace.config.DedupConfig;
|
import eu.dnetlib.pace.config.DedupConfig;
|
||||||
|
|
||||||
|
@ -16,17 +25,17 @@ public class EventFinder {
|
||||||
private static List<UpdateMatcher<?>> matchers = new ArrayList<>();
|
private static List<UpdateMatcher<?>> matchers = new ArrayList<>();
|
||||||
static {
|
static {
|
||||||
matchers.add(new EnrichMissingAbstract());
|
matchers.add(new EnrichMissingAbstract());
|
||||||
// matchers.add(new EnrichMissingAuthorOrcid());
|
matchers.add(new EnrichMissingAuthorOrcid());
|
||||||
// matchers.add(new EnrichMissingOpenAccess());
|
matchers.add(new EnrichMissingOpenAccess());
|
||||||
// matchers.add(new EnrichMissingPid());
|
matchers.add(new EnrichMissingPid());
|
||||||
// matchers.add(new EnrichMissingPublicationDate());
|
matchers.add(new EnrichMissingPublicationDate());
|
||||||
// matchers.add(new EnrichMissingSubject());
|
matchers.add(new EnrichMissingSubject());
|
||||||
// matchers.add(new EnrichMoreOpenAccess());
|
matchers.add(new EnrichMoreOpenAccess());
|
||||||
// matchers.add(new EnrichMorePid());
|
matchers.add(new EnrichMorePid());
|
||||||
// matchers.add(new EnrichMoreSubject());
|
matchers.add(new EnrichMoreSubject());
|
||||||
|
|
||||||
// // Advanced matchers
|
// // Advanced matchers
|
||||||
// matchers.add(new EnrichMissingProject());
|
matchers.add(new EnrichMissingProject());
|
||||||
// matchers.add(new EnrichMoreProject());
|
// matchers.add(new EnrichMoreProject());
|
||||||
// matchers.add(new EnrichMissingSoftware());
|
// matchers.add(new EnrichMissingSoftware());
|
||||||
// matchers.add(new EnrichMoreSoftware());
|
// matchers.add(new EnrichMoreSoftware());
|
||||||
|
|
Loading…
Reference in New Issue