package eu.dnetlib.dhp.schema.oaf.utils; import static com.google.common.base.Preconditions.checkArgument; import static eu.dnetlib.dhp.schema.common.ModelConstants.*; import java.io.Serializable; import java.nio.charset.StandardCharsets; import java.security.MessageDigest; import java.util.*; import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.commons.codec.binary.Hex; import org.apache.commons.lang3.StringUtils; import com.google.common.collect.HashBiMap; import com.google.common.collect.Maps; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; /** * Factory class for OpenAIRE identifiers in the Graph */ public class IdentifierFactory implements Serializable { public static final String ID_SEPARATOR = "::"; public static final String ID_PREFIX_SEPARATOR = "|"; public static final int ID_PREFIX_LEN = 12; /** * Declares the associations PID_TYPE -> [DATASOURCE ID, NAME] considered authoritative for that PID_TYPE */ public static final Map> PID_AUTHORITY = Maps.newHashMap(); static { PID_AUTHORITY.put(PidType.doi, HashBiMap.create()); PID_AUTHORITY.get(PidType.doi).put(CROSSREF_ID, "Crossref"); PID_AUTHORITY.get(PidType.doi).put(DATACITE_ID, "Datacite"); PID_AUTHORITY.put(PidType.pmc, HashBiMap.create()); PID_AUTHORITY.get(PidType.pmc).put(EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central"); PID_AUTHORITY.get(PidType.pmc).put(PUBMED_CENTRAL_ID, "PubMed Central"); PID_AUTHORITY.put(PidType.pmid, HashBiMap.create()); PID_AUTHORITY.get(PidType.pmid).put(EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central"); PID_AUTHORITY.get(PidType.pmid).put(PUBMED_CENTRAL_ID, "PubMed Central"); PID_AUTHORITY.put(PidType.arXiv, HashBiMap.create()); PID_AUTHORITY.get(PidType.arXiv).put(ARXIV_ID, "arXiv.org e-Print Archive"); } public static List getPids(List pid, KeyValue collectedFrom) { return pidFromInstance(pid, collectedFrom, true).distinct().collect(Collectors.toList()); } public static String createDOIBoostIdentifier(T entity) { if (entity == null) return null; StructuredProperty pid = null; if (entity.getPid() != null) { pid = entity .getPid() .stream() .filter(Objects::nonNull) .filter(s -> s.getQualifier() != null && "doi".equalsIgnoreCase(s.getQualifier().getClassid())) .filter(CleaningFunctions::pidFilter) .findAny() .orElse(null); } else { if (entity.getInstance() != null) { pid = entity .getInstance() .stream() .filter(i -> i.getPid() != null) .flatMap(i -> i.getPid().stream()) .filter(CleaningFunctions::pidFilter) .findAny() .orElse(null); } } if (pid != null) return idFromPid(entity, pid, true); return null; } /** * Creates an identifier from the most relevant PID (if available) provided by a known PID authority in the given * entity T. Returns entity.id when none of the PIDs meet the selection criteria is available. * * @param entity the entity providing PIDs and a default ID. * @param the specific entity type. Currently Organization and Result subclasses are supported. * @param md5 indicates whether should hash the PID value or not. * @return an identifier from the most relevant PID, entity.id otherwise */ public static String createIdentifier(T entity, boolean md5) { checkArgument(StringUtils.isNoneBlank(entity.getId()), "missing entity identifier"); final Map> pids = extractPids(entity); return pids .values() .stream() .flatMap(s -> s.stream()) .min(new PidComparator<>(entity)) .map( min -> Optional .ofNullable(pids.get(min.getQualifier().getClassid())) .map( p -> p .stream() .sorted(new PidValueComparator()) .findFirst() .map(s -> idFromPid(entity, s, md5)) .orElseGet(entity::getId)) .orElseGet(entity::getId)) .orElseGet(entity::getId); } private static Map> extractPids(T entity) { if (entity instanceof Result) { return Optional .ofNullable(((Result) entity).getInstance()) .map( instance -> mapPids(instance)) .orElse(new HashMap<>()); } else { return entity .getPid() .stream() .map(CleaningFunctions::normalizePidValue) .filter(CleaningFunctions::pidFilter) .collect( Collectors .groupingBy( p -> p.getQualifier().getClassid(), Collectors.mapping(p -> p, Collectors.toCollection(HashSet::new)))); } } private static Map> mapPids(List instance) { return instance .stream() .map(i -> pidFromInstance(i.getPid(), i.getCollectedfrom(), false)) .flatMap(Function.identity()) .collect( Collectors .groupingBy( p -> p.getQualifier().getClassid(), Collectors.mapping(p -> p, Collectors.toCollection(HashSet::new)))); } private static Stream pidFromInstance(List pid, KeyValue collectedFrom, boolean mapHandles) { return Optional .ofNullable(pid) .map( pp -> pp .stream() // filter away PIDs provided by a DS that is not considered an authority for the // given PID Type .filter(p -> { return shouldFilterPid(collectedFrom, p, mapHandles); }) .map(CleaningFunctions::normalizePidValue) .filter(CleaningFunctions::pidFilter)) .orElse(Stream.empty()); } private static boolean shouldFilterPid(KeyValue collectedFrom, StructuredProperty p, boolean mapHandles) { final PidType pType = PidType.tryValueOf(p.getQualifier().getClassid()); return (mapHandles && pType.equals(PidType.handle)) || Optional.ofNullable(collectedFrom).isPresent() && Optional .ofNullable(PID_AUTHORITY.get(pType)) .map(authorities -> { return authorities.containsKey(collectedFrom.getKey()) || authorities.containsValue(collectedFrom.getValue()); }) .orElse(false); } /** * @see {@link IdentifierFactory#createIdentifier(OafEntity, boolean)} */ public static String createIdentifier(T entity) { return createIdentifier(entity, true); } private static String idFromPid(T entity, StructuredProperty s, boolean md5) { return new StringBuilder() .append(ModelSupport.getIdPrefix(entity.getClass())) .append(ID_PREFIX_SEPARATOR) .append(createPrefix(s.getQualifier().getClassid())) .append(ID_SEPARATOR) .append(md5 ? md5(s.getValue()) : s.getValue()) .toString(); } // create the prefix (length = 12) private static String createPrefix(String pidType) { StringBuilder prefix = new StringBuilder(StringUtils.left(pidType, ID_PREFIX_LEN)); while (prefix.length() < ID_PREFIX_LEN) { prefix.append("_"); } return prefix.substring(0, ID_PREFIX_LEN); } public static String md5(final String s) { try { final MessageDigest md = MessageDigest.getInstance("MD5"); md.update(s.getBytes(StandardCharsets.UTF_8)); return new String(Hex.encodeHex(md.digest())); } catch (final Exception e) { System.err.println("Error creating id"); return null; } } }