2020-03-27 13:48:44 +01:00
|
|
|
package eu.dnetlib.dedup;
|
|
|
|
|
|
|
|
import static java.util.Collections.reverseOrder;
|
|
|
|
import static java.util.Map.Entry.comparingByValue;
|
|
|
|
import static java.util.stream.Collectors.toMap;
|
|
|
|
import static org.apache.commons.lang.StringUtils.endsWith;
|
|
|
|
import static org.apache.commons.lang.StringUtils.substringBefore;
|
|
|
|
|
2020-04-18 12:42:58 +02:00
|
|
|
import eu.dnetlib.dhp.schema.oaf.Field;
|
|
|
|
import java.time.Year;
|
|
|
|
import java.util.*;
|
|
|
|
import java.util.stream.Collectors;
|
|
|
|
import org.apache.commons.lang.StringUtils;
|
|
|
|
|
2020-03-27 13:48:44 +01:00
|
|
|
public class DatePicker {
|
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
private static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}";
|
|
|
|
private static final String DATE_DEFAULT_SUFFIX = "01-01";
|
|
|
|
private static final int YEAR_LB = 1300;
|
|
|
|
private static final int YEAR_UB = Year.now().getValue() + 5;
|
2020-03-27 13:48:44 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
public static Field<String> pick(final Collection<String> dateofacceptance) {
|
2020-03-27 13:48:44 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
final Map<String, Integer> frequencies =
|
|
|
|
dateofacceptance
|
|
|
|
.parallelStream()
|
|
|
|
.filter(StringUtils::isNotBlank)
|
|
|
|
.collect(Collectors.toConcurrentMap(w -> w, w -> 1, Integer::sum));
|
2020-03-27 13:48:44 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
if (frequencies.isEmpty()) {
|
|
|
|
return new Field<>();
|
|
|
|
}
|
|
|
|
|
|
|
|
final Field<String> date = new Field<>();
|
|
|
|
date.setValue(frequencies.keySet().iterator().next());
|
|
|
|
|
|
|
|
// let's sort this map by values first, filtering out invalid dates
|
|
|
|
final Map<String, Integer> sorted =
|
|
|
|
frequencies.entrySet().stream()
|
|
|
|
.filter(d -> StringUtils.isNotBlank(d.getKey()))
|
|
|
|
.filter(d -> d.getKey().matches(DATE_PATTERN))
|
|
|
|
.filter(d -> inRange(d.getKey()))
|
|
|
|
.sorted(reverseOrder(comparingByValue()))
|
|
|
|
.collect(
|
|
|
|
toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2, LinkedHashMap::new));
|
|
|
|
|
|
|
|
// shortcut
|
|
|
|
if (sorted.size() == 0) {
|
|
|
|
return date;
|
|
|
|
}
|
|
|
|
|
|
|
|
// voting method (1/3 + 1) wins
|
|
|
|
if (sorted.size() >= 3) {
|
|
|
|
final int acceptThreshold = (sorted.size() / 3) + 1;
|
|
|
|
final List<String> accepted =
|
|
|
|
sorted.entrySet().stream()
|
|
|
|
.filter(e -> e.getValue() >= acceptThreshold)
|
|
|
|
.map(e -> e.getKey())
|
|
|
|
.collect(Collectors.toList());
|
|
|
|
|
|
|
|
// cannot find strong majority
|
|
|
|
if (accepted.isEmpty()) {
|
|
|
|
final int max = sorted.values().iterator().next();
|
|
|
|
Optional<String> first =
|
|
|
|
sorted.entrySet().stream()
|
|
|
|
.filter(e -> e.getValue() == max && !endsWith(e.getKey(), DATE_DEFAULT_SUFFIX))
|
|
|
|
.map(Map.Entry::getKey)
|
|
|
|
.findFirst();
|
|
|
|
if (first.isPresent()) {
|
|
|
|
date.setValue(first.get());
|
|
|
|
return date;
|
2020-03-27 13:48:44 +01:00
|
|
|
}
|
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
date.setValue(sorted.keySet().iterator().next());
|
|
|
|
return date;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (accepted.size() == 1) {
|
|
|
|
date.setValue(accepted.get(0));
|
|
|
|
return date;
|
|
|
|
} else {
|
|
|
|
final Optional<String> first =
|
|
|
|
accepted.stream().filter(d -> !endsWith(d, DATE_DEFAULT_SUFFIX)).findFirst();
|
|
|
|
if (first.isPresent()) {
|
|
|
|
date.setValue(first.get());
|
|
|
|
return date;
|
2020-03-27 13:48:44 +01:00
|
|
|
}
|
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
return date;
|
|
|
|
}
|
|
|
|
|
|
|
|
// 1st non YYYY-01-01 is returned
|
|
|
|
} else {
|
|
|
|
if (sorted.size() == 2) {
|
|
|
|
for (Map.Entry<String, Integer> e : sorted.entrySet()) {
|
|
|
|
if (!endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) {
|
|
|
|
date.setValue(e.getKey());
|
2020-03-27 13:48:44 +01:00
|
|
|
return date;
|
2020-04-27 14:45:40 +02:00
|
|
|
}
|
2020-03-27 13:48:44 +01:00
|
|
|
}
|
2020-04-27 14:45:40 +02:00
|
|
|
}
|
2020-03-27 13:48:44 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
// none of the dates seems good enough, return the 1st one
|
|
|
|
date.setValue(sorted.keySet().iterator().next());
|
|
|
|
return date;
|
2020-03-27 13:48:44 +01:00
|
|
|
}
|
2020-04-27 14:45:40 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
private static boolean inRange(final String date) {
|
|
|
|
final int year = Integer.parseInt(substringBefore(date, "-"));
|
|
|
|
return year >= YEAR_LB && year <= YEAR_UB;
|
|
|
|
}
|
2020-04-18 12:42:58 +02:00
|
|
|
}
|