dnet-hadoop/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java

70 lines
2.0 KiB
Java
Raw Normal View History

2020-05-13 12:00:27 +02:00
2020-05-15 12:25:37 +02:00
package eu.dnetlib.dhp.broker.oa.matchers;
2020-05-13 12:00:27 +02:00
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
2020-05-15 12:25:37 +02:00
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
2020-06-11 11:25:18 +02:00
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
2020-05-13 12:00:27 +02:00
import eu.dnetlib.dhp.schema.oaf.Field;
2020-06-09 16:01:31 +02:00
import eu.dnetlib.pace.config.DedupConfig;
2020-05-13 12:00:27 +02:00
2020-06-11 11:25:18 +02:00
public abstract class UpdateMatcher<T> {
2020-05-13 12:00:27 +02:00
private final boolean multipleUpdate;
public UpdateMatcher(final boolean multipleUpdate) {
this.multipleUpdate = multipleUpdate;
}
2020-06-11 11:25:18 +02:00
public Collection<UpdateInfo<T>> searchUpdatesForRecord(final ResultWithRelations res,
final Collection<ResultWithRelations> others,
2020-06-10 12:11:16 +02:00
final DedupConfig dedupConfig) {
2020-05-13 12:00:27 +02:00
final Map<String, UpdateInfo<T>> infoMap = new HashMap<>();
2020-06-11 11:25:18 +02:00
for (final ResultWithRelations source : others) {
2020-05-13 12:00:27 +02:00
if (source != res) {
2020-06-09 16:01:31 +02:00
for (final UpdateInfo<T> info : findUpdates(source, res, dedupConfig)) {
2020-05-13 12:00:27 +02:00
final String s = DigestUtils.md5Hex(info.getHighlightValueAsString());
2020-06-10 12:11:16 +02:00
if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) {
} else {
2020-05-13 12:00:27 +02:00
infoMap.put(s, info);
}
}
}
}
final Collection<UpdateInfo<T>> values = infoMap.values();
if (values.isEmpty() || multipleUpdate) {
return values;
} else {
final UpdateInfo<T> v = values
.stream()
.sorted((o1, o2) -> Float.compare(o1.getTrust(), o2.getTrust()))
.findFirst()
.get();
return Arrays.asList(v);
}
}
2020-06-11 11:25:18 +02:00
protected abstract List<UpdateInfo<T>> findUpdates(ResultWithRelations source, ResultWithRelations target,
DedupConfig dedupConfig);
2020-05-13 12:00:27 +02:00
protected static boolean isMissing(final List<Field<String>> list) {
return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0).getValue());
}
protected boolean isMissing(final Field<String> field) {
return field == null || StringUtils.isBlank(field.getValue());
}
2020-05-13 12:00:27 +02:00
}