dnet-hadoop/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/IdGenerator.java

61 lines
1.8 KiB
Java
Raw Normal View History

package eu.dnetlib.dhp.oa.dedup;
import static eu.dnetlib.dhp.utils.DHPUtils.md5;
import static org.apache.commons.lang3.StringUtils.substringAfter;
import static org.apache.commons.lang3.StringUtils.substringBefore;
import java.io.Serializable;
import java.util.List;
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
public class IdGenerator implements Serializable {
// pick the best pid from the list (consider date and pidtype)
public static <T extends OafEntity> String generate(List<? extends Identifier> pids, String defaultID) {
2021-08-11 12:13:22 +02:00
if (pids == null || pids.isEmpty())
return defaultID;
2022-11-09 14:20:59 +01:00
return generateId(pids);
}
private static String generateId(List<? extends Identifier> pids) {
Identifier bp = pids
.stream()
.min(Identifier::compareTo)
2021-08-11 12:13:22 +02:00
.orElseThrow(() -> new IllegalStateException("unable to generate id"));
return generate(bp.getOriginalID());
}
public static String generate(String originalId) {
String prefix = substringBefore(originalId, "|");
String ns = substringBefore(substringAfter(originalId, "|"), "::");
String suffix = substringAfter(originalId, "::");
final String pidType = substringBefore(ns, "_");
if (PidType.isValid(pidType)) {
return prefix + "|" + dedupify(ns) + "::" + suffix;
} else {
return prefix + "|dedup_wf_002::" + md5(originalId); // hash the whole originalId to avoid collisions
}
}
private static String dedupify(String ns) {
2021-03-29 10:07:12 +02:00
StringBuilder prefix;
if (PidType.valueOf(substringBefore(ns, "_")) == PidType.openorgs) {
prefix = new StringBuilder(substringBefore(ns, "_"));
} else {
prefix = new StringBuilder(substringBefore(ns, "_")).append("_dedup");
}
while (prefix.length() < 12) {
prefix.append("_");
}
return prefix.substring(0, 12);
}
}