dnet-hadoop/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java

117 lines
3.6 KiB
Java

package eu.dnetlib.dhp.broker.oa.matchers;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.BiConsumer;
import java.util.function.Function;
import java.util.stream.Collectors;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.util.LongAccumulator;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.broker.objects.OaBrokerRelatedDatasource;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
public abstract class UpdateMatcher<T> {
private final int maxNumber;
private final Function<T, Topic> topicFunction;
private final BiConsumer<OaBrokerMainEntity, T> compileHighlightFunction;
private final Function<T, String> highlightToStringFunction;
public UpdateMatcher(final int maxNumber, final Function<T, Topic> topicFunction,
final BiConsumer<OaBrokerMainEntity, T> compileHighlightFunction,
final Function<T, String> highlightToStringFunction) {
this.maxNumber = maxNumber;
this.topicFunction = topicFunction;
this.compileHighlightFunction = compileHighlightFunction;
this.highlightToStringFunction = highlightToStringFunction;
}
public Collection<UpdateInfo<T>> searchUpdatesForRecord(final OaBrokerMainEntity target,
final OaBrokerRelatedDatasource targetDs,
final Collection<OaBrokerMainEntity> others,
final Map<String, LongAccumulator> accumulators) {
final Map<String, UpdateInfo<T>> infoMap = new HashMap<>();
for (final OaBrokerMainEntity source : others) {
if (source != target) {
for (final T hl : findDifferences(source, target)) {
final Topic topic = getTopicFunction().apply(hl);
if (topic != null) {
final UpdateInfo<T> info = new UpdateInfo<>(topic, hl, source, target, targetDs,
getCompileHighlightFunction(),
getHighlightToStringFunction());
final String s = DigestUtils.md5Hex(info.getHighlightValueAsString());
if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) {
infoMap.put(s, info);
}
}
}
}
}
final List<UpdateInfo<T>> values = infoMap
.values()
.stream()
.sorted((o1, o2) -> Float.compare(o2.getTrust(), o1.getTrust())) // DESCENDING
.collect(Collectors.toList());
if (values.isEmpty()) {
return new ArrayList<>();
} else if (values.size() > maxNumber) {
incrementAccumulator(accumulators, maxNumber);
return values.subList(0, maxNumber);
} else {
incrementAccumulator(accumulators, values.size());
return values;
}
}
protected abstract List<T> findDifferences(OaBrokerMainEntity source, OaBrokerMainEntity target);
protected static boolean isMissing(final List<String> list) {
return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0));
}
protected boolean isMissing(final String s) {
return StringUtils.isBlank(s);
}
public int getMaxNumber() {
return maxNumber;
}
public Function<T, Topic> getTopicFunction() {
return topicFunction;
}
public BiConsumer<OaBrokerMainEntity, T> getCompileHighlightFunction() {
return compileHighlightFunction;
}
public Function<T, String> getHighlightToStringFunction() {
return highlightToStringFunction;
}
public String accumulatorName() {
return "event_matcher_" + getClass().getSimpleName().toLowerCase();
}
public void incrementAccumulator(final Map<String, LongAccumulator> accumulators, final long n) {
if (accumulators != null && accumulators.containsKey(accumulatorName())) {
accumulators.get(accumulatorName()).add(n);
}
}
}