dnet-hadoop/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java

117 lines
3.6 KiB
Java
Raw Normal View History

2020-05-13 12:00:27 +02:00
2020-05-15 12:25:37 +02:00
package eu.dnetlib.dhp.broker.oa.matchers;
2020-05-13 12:00:27 +02:00
2020-06-26 11:20:45 +02:00
import java.util.ArrayList;
2020-05-13 12:00:27 +02:00
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
2020-06-12 09:47:55 +02:00
import java.util.function.BiConsumer;
import java.util.function.Function;
2020-06-26 11:20:45 +02:00
import java.util.stream.Collectors;
2020-05-13 12:00:27 +02:00
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
2020-06-29 16:33:32 +02:00
import org.apache.spark.util.LongAccumulator;
2020-05-13 12:00:27 +02:00
2020-06-22 08:51:31 +02:00
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
2020-07-15 09:18:40 +02:00
import eu.dnetlib.broker.objects.OaBrokerRelatedDatasource;
2020-06-12 09:47:55 +02:00
import eu.dnetlib.dhp.broker.model.Topic;
2020-05-15 12:25:37 +02:00
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
2020-05-13 12:00:27 +02:00
2020-06-11 11:25:18 +02:00
public abstract class UpdateMatcher<T> {
2020-05-13 12:00:27 +02:00
2020-06-26 11:20:45 +02:00
private final int maxNumber;
2020-06-12 09:47:55 +02:00
private final Function<T, Topic> topicFunction;
2020-06-22 08:51:31 +02:00
private final BiConsumer<OaBrokerMainEntity, T> compileHighlightFunction;
2020-06-12 09:47:55 +02:00
private final Function<T, String> highlightToStringFunction;
2020-05-13 12:00:27 +02:00
2020-06-26 11:20:45 +02:00
public UpdateMatcher(final int maxNumber, final Function<T, Topic> topicFunction,
2020-06-22 08:51:31 +02:00
final BiConsumer<OaBrokerMainEntity, T> compileHighlightFunction,
2020-06-12 09:47:55 +02:00
final Function<T, String> highlightToStringFunction) {
2020-06-26 11:20:45 +02:00
this.maxNumber = maxNumber;
2020-06-12 09:47:55 +02:00
this.topicFunction = topicFunction;
this.compileHighlightFunction = compileHighlightFunction;
this.highlightToStringFunction = highlightToStringFunction;
2020-05-13 12:00:27 +02:00
}
2020-07-15 09:18:40 +02:00
public Collection<UpdateInfo<T>> searchUpdatesForRecord(final OaBrokerMainEntity target,
final OaBrokerRelatedDatasource targetDs,
2020-06-22 08:51:31 +02:00
final Collection<OaBrokerMainEntity> others,
2020-06-29 16:33:32 +02:00
final Map<String, LongAccumulator> accumulators) {
2020-05-13 12:00:27 +02:00
final Map<String, UpdateInfo<T>> infoMap = new HashMap<>();
2020-06-22 08:51:31 +02:00
for (final OaBrokerMainEntity source : others) {
2020-07-15 09:18:40 +02:00
if (source != target) {
for (final T hl : findDifferences(source, target)) {
2020-06-12 09:47:55 +02:00
final Topic topic = getTopicFunction().apply(hl);
2020-06-25 13:01:09 +02:00
if (topic != null) {
2020-07-15 09:18:40 +02:00
final UpdateInfo<T> info = new UpdateInfo<>(topic, hl, source, target, targetDs,
2020-06-25 13:01:09 +02:00
getCompileHighlightFunction(),
2020-07-09 12:53:46 +02:00
getHighlightToStringFunction());
2020-06-25 13:01:09 +02:00
final String s = DigestUtils.md5Hex(info.getHighlightValueAsString());
if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) {
infoMap.put(s, info);
}
2020-05-13 12:00:27 +02:00
}
}
}
}
2020-06-26 11:20:45 +02:00
final List<UpdateInfo<T>> values = infoMap
.values()
.stream()
.sorted((o1, o2) -> Float.compare(o2.getTrust(), o1.getTrust())) // DESCENDING
.collect(Collectors.toList());
if (values.isEmpty()) {
return new ArrayList<>();
} else if (values.size() > maxNumber) {
2020-06-29 16:33:32 +02:00
incrementAccumulator(accumulators, maxNumber);
2020-06-26 11:20:45 +02:00
return values.subList(0, maxNumber);
2020-05-13 12:00:27 +02:00
} else {
2020-06-29 16:33:32 +02:00
incrementAccumulator(accumulators, values.size());
2020-06-26 11:20:45 +02:00
return values;
2020-05-13 12:00:27 +02:00
}
}
2020-06-22 08:51:31 +02:00
protected abstract List<T> findDifferences(OaBrokerMainEntity source, OaBrokerMainEntity target);
2020-05-13 12:00:27 +02:00
2020-06-16 12:34:13 +02:00
protected static boolean isMissing(final List<String> list) {
return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0));
2020-05-13 12:00:27 +02:00
}
2020-06-30 16:17:09 +02:00
protected boolean isMissing(final String s) {
return StringUtils.isBlank(s);
}
2020-06-26 11:20:45 +02:00
public int getMaxNumber() {
return maxNumber;
2020-06-12 09:47:55 +02:00
}
public Function<T, Topic> getTopicFunction() {
return topicFunction;
}
2020-06-22 08:51:31 +02:00
public BiConsumer<OaBrokerMainEntity, T> getCompileHighlightFunction() {
2020-06-12 09:47:55 +02:00
return compileHighlightFunction;
}
public Function<T, String> getHighlightToStringFunction() {
return highlightToStringFunction;
}
2020-06-29 16:33:32 +02:00
public String accumulatorName() {
return "event_matcher_" + getClass().getSimpleName().toLowerCase();
}
public void incrementAccumulator(final Map<String, LongAccumulator> accumulators, final long n) {
2020-06-30 16:17:09 +02:00
if (accumulators != null && accumulators.containsKey(accumulatorName())) {
2020-06-29 16:33:32 +02:00
accumulators.get(accumulatorName()).add(n);
}
}
2020-05-13 12:00:27 +02:00
}