package eu.dnetlib.dhp.oa.dedup; import static java.util.Collections.reverseOrder; import static java.util.Map.Entry.comparingByValue; import static java.util.stream.Collectors.toMap; import static org.apache.commons.lang.StringUtils.endsWith; import static org.apache.commons.lang.StringUtils.substringBefore; import java.time.Year; import java.util.*; import java.util.stream.Collectors; import org.apache.commons.lang.StringUtils; import eu.dnetlib.dhp.schema.oaf.Field; public class DatePicker { public static final String DATE_PATTERN = "^(\\d{4})-(\\d{2})-(\\d{2})"; private static final String DATE_DEFAULT_SUFFIX = "01-01"; private static final int YEAR_LB = 1300; private static final int YEAR_UB = Year.now().getValue() + 5; public static Field pick(final Collection dateofacceptance) { final Map frequencies = dateofacceptance .parallelStream() .filter(StringUtils::isNotBlank) .map(d -> substringBefore(d, "T")) .collect(Collectors.toConcurrentMap(w -> w, w -> 1, Integer::sum)); if (frequencies.isEmpty()) { return new Field<>(); } final Field date = new Field<>(); date.setValue(frequencies.keySet().iterator().next()); // let's sort this map by values first, filtering out invalid dates final Map sorted = frequencies .entrySet() .stream() .filter(d -> StringUtils.isNotBlank(d.getKey())) .filter(d -> d.getKey().matches(DATE_PATTERN)) .filter(d -> inRange(d.getKey())) .sorted(reverseOrder(comparingByValue())) .collect( toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2, LinkedHashMap::new)); // shortcut if (sorted.size() == 0) { return date; } // voting method (1/3 + 1) wins if (sorted.size() >= 3) { final int acceptThreshold = (sorted.size() / 3) + 1; final List accepted = sorted .entrySet() .stream() .filter(e -> e.getValue() >= acceptThreshold) .map(e -> e.getKey()) .collect(Collectors.toList()); // cannot find strong majority if (accepted.isEmpty()) { final int max = sorted.values().iterator().next(); Optional first = sorted .entrySet() .stream() .filter(e -> e.getValue() == max && !endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) .map(Map.Entry::getKey) .findFirst(); if (first.isPresent()) { date.setValue(first.get()); return date; } date.setValue(sorted.keySet().iterator().next()); return date; } if (accepted.size() == 1) { date.setValue(accepted.get(0)); return date; } else { final Optional first = accepted .stream() .filter(d -> !endsWith(d, DATE_DEFAULT_SUFFIX)) .findFirst(); if (first.isPresent()) { date.setValue(first.get()); return date; } return date; } // 1st non YYYY-01-01 is returned } else { if (sorted.size() == 2) { for (Map.Entry e : sorted.entrySet()) { if (!endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) { date.setValue(e.getKey()); return date; } } } // none of the dates seems good enough, return the 1st one date.setValue(sorted.keySet().iterator().next()); return date; } } public static boolean inRange(final String date) { final int year = Integer.parseInt(substringBefore(date, "-")); return year >= YEAR_LB && year <= YEAR_UB; } }