2020-10-06 15:44:53 +02:00
|
|
|
|
|
|
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
|
|
|
|
2021-03-16 17:05:38 +01:00
|
|
|
import static com.google.common.base.Preconditions.checkArgument;
|
2021-03-09 17:11:50 +01:00
|
|
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
|
|
|
|
2020-12-09 09:10:33 +01:00
|
|
|
import java.io.Serializable;
|
2021-03-09 17:11:50 +01:00
|
|
|
import java.util.*;
|
|
|
|
import java.util.function.Function;
|
2020-12-09 09:10:33 +01:00
|
|
|
import java.util.stream.Collectors;
|
2021-03-09 17:11:50 +01:00
|
|
|
import java.util.stream.Stream;
|
|
|
|
|
2021-03-17 15:14:53 +01:00
|
|
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
2021-03-09 17:11:50 +01:00
|
|
|
import org.apache.commons.lang3.StringUtils;
|
2020-10-06 15:44:53 +02:00
|
|
|
|
2021-03-09 11:37:41 +01:00
|
|
|
import com.google.common.collect.HashBiMap;
|
|
|
|
import com.google.common.collect.Maps;
|
2020-12-09 17:07:20 +01:00
|
|
|
|
2021-03-09 17:11:50 +01:00
|
|
|
import eu.dnetlib.dhp.schema.oaf.*;
|
2020-12-09 17:07:20 +01:00
|
|
|
import eu.dnetlib.dhp.utils.DHPUtils;
|
|
|
|
|
2020-10-06 15:44:53 +02:00
|
|
|
/**
|
|
|
|
* Factory class for OpenAIRE identifiers in the Graph
|
|
|
|
*/
|
|
|
|
public class IdentifierFactory implements Serializable {
|
|
|
|
|
|
|
|
public static final String ID_SEPARATOR = "::";
|
|
|
|
public static final String ID_PREFIX_SEPARATOR = "|";
|
2020-11-03 18:43:37 +01:00
|
|
|
|
|
|
|
public final static String DOI_REGEX = "(^10\\.[0-9]{4,9}\\/[-._;()\\/:a-zA-Z0-9]+$)|" +
|
|
|
|
"(^10\\.1002\\/[^\\s]+$)|" +
|
|
|
|
"(^10\\.1021\\/[a-zA-Z0-9_][a-zA-Z0-9_][0-9]++$)|" +
|
|
|
|
"(^10\\.1207\\/[a-zA-Z0-9_]+\\&[0-9]+_[0-9]+$)";
|
|
|
|
|
2020-10-06 15:44:53 +02:00
|
|
|
public static final int ID_PREFIX_LEN = 12;
|
|
|
|
|
2021-03-09 17:11:50 +01:00
|
|
|
/**
 * Registry of the datasources considered authoritative for each PID type: every PID_TYPE is associated with
 * a bidirectional map whose keys are DATASOURCE IDs and whose values are the corresponding DATASOURCE NAMEs.
 */
public static final Map<PidType, HashBiMap<String, String>> PID_AUTHORITY = Maps.newHashMap();

static {
	// DOI authorities
	final HashBiMap<String, String> doiAuthorities = HashBiMap.create();
	doiAuthorities.put(CROSSREF_ID, "Crossref");
	doiAuthorities.put(DATACITE_ID, "Datacite");
	PID_AUTHORITY.put(PidType.doi, doiAuthorities);

	// PMC authorities
	final HashBiMap<String, String> pmcAuthorities = HashBiMap.create();
	pmcAuthorities.put(EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central");
	pmcAuthorities.put(PUBMED_CENTRAL_ID, "PubMed Central");
	PID_AUTHORITY.put(PidType.pmc, pmcAuthorities);

	// PMID authorities (same datasources as PMC, kept in a separate map instance)
	final HashBiMap<String, String> pmidAuthorities = HashBiMap.create();
	pmidAuthorities.put(EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central");
	pmidAuthorities.put(PUBMED_CENTRAL_ID, "PubMed Central");
	PID_AUTHORITY.put(PidType.pmid, pmidAuthorities);

	// arXiv authorities
	final HashBiMap<String, String> arxivAuthorities = HashBiMap.create();
	arxivAuthorities.put(ARXIV_ID, "arXiv.org e-Print Archive");
	PID_AUTHORITY.put(PidType.arXiv, arxivAuthorities);
}
|
|
|
|
|
2021-03-16 14:19:32 +01:00
|
|
|
public static List<StructuredProperty> getPids(List<StructuredProperty> pid, KeyValue collectedFrom) {
|
2021-03-17 12:45:38 +01:00
|
|
|
return pidFromInstance(pid, collectedFrom).distinct().collect(Collectors.toList());
|
2021-03-16 14:19:32 +01:00
|
|
|
}
|
|
|
|
|
2020-10-30 10:56:42 +01:00
|
|
|
/**
|
2021-03-09 17:11:50 +01:00
|
|
|
* Creates an identifier from the most relevant PID (if available) provided by a known PID authority in the given
|
|
|
|
* entity T. Returns entity.id when none of the PIDs meet the selection criteria is available.
|
|
|
|
*
|
2020-10-30 10:56:42 +01:00
|
|
|
* @param entity the entity providing PIDs and a default ID.
|
|
|
|
* @param <T> the specific entity type. Currently Organization and Result subclasses are supported.
|
2020-11-30 12:00:38 +01:00
|
|
|
* @param md5 indicates whether should hash the PID value or not.
|
2020-10-30 10:56:42 +01:00
|
|
|
* @return an identifier from the most relevant PID, entity.id otherwise
|
|
|
|
*/
|
2020-11-30 12:00:38 +01:00
|
|
|
public static <T extends OafEntity> String createIdentifier(T entity, boolean md5) {
|
2021-03-09 11:37:41 +01:00
|
|
|
|
2021-03-16 17:05:38 +01:00
|
|
|
checkArgument(StringUtils.isNoneBlank(entity.getId()), "missing entity identifier");
|
|
|
|
|
2021-03-09 17:11:50 +01:00
|
|
|
final Map<String, List<StructuredProperty>> pids = extractPids(entity);
|
2020-11-23 19:16:40 +01:00
|
|
|
|
|
|
|
return pids
|
2020-11-24 14:41:39 +01:00
|
|
|
.values()
|
|
|
|
.stream()
|
|
|
|
.flatMap(s -> s.stream())
|
|
|
|
.min(new PidComparator<>(entity))
|
|
|
|
.map(
|
|
|
|
min -> Optional
|
|
|
|
.ofNullable(pids.get(min.getQualifier().getClassid()))
|
|
|
|
.map(
|
|
|
|
p -> p
|
|
|
|
.stream()
|
|
|
|
.sorted(new PidValueComparator())
|
|
|
|
.findFirst()
|
2020-11-30 12:00:38 +01:00
|
|
|
.map(s -> idFromPid(entity, s, md5))
|
2020-11-24 14:41:39 +01:00
|
|
|
.orElseGet(entity::getId))
|
|
|
|
.orElseGet(entity::getId))
|
|
|
|
.orElseGet(entity::getId);
|
2020-10-06 15:44:53 +02:00
|
|
|
}
|
|
|
|
|
2021-03-09 17:11:50 +01:00
|
|
|
private static <T extends OafEntity> Map<String, List<StructuredProperty>> extractPids(T entity) {
|
|
|
|
if (entity instanceof Result) {
|
|
|
|
return Optional
|
|
|
|
.ofNullable(((Result) entity).getInstance())
|
|
|
|
.map(
|
2021-03-16 14:19:32 +01:00
|
|
|
instance -> mapPids(instance))
|
2021-03-09 17:11:50 +01:00
|
|
|
.orElse(new HashMap<>());
|
|
|
|
} else {
|
|
|
|
return entity
|
|
|
|
.getPid()
|
|
|
|
.stream()
|
|
|
|
.map(CleaningFunctions::normalizePidValue)
|
|
|
|
.filter(IdentifierFactory::pidFilter)
|
|
|
|
.collect(
|
|
|
|
Collectors
|
|
|
|
.groupingBy(
|
|
|
|
p -> p.getQualifier().getClassid(),
|
|
|
|
Collectors.mapping(p -> p, Collectors.toList())));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-03-16 14:19:32 +01:00
|
|
|
private static Map<String, List<StructuredProperty>> mapPids(List<Instance> instance) {
|
|
|
|
return instance
|
|
|
|
.stream()
|
|
|
|
.map(i -> pidFromInstance(i.getPid(), i.getCollectedfrom()))
|
|
|
|
.flatMap(Function.identity())
|
|
|
|
.collect(
|
|
|
|
Collectors
|
|
|
|
.groupingBy(
|
|
|
|
p -> p.getQualifier().getClassid(),
|
|
|
|
Collectors.mapping(p -> p, Collectors.toList())));
|
|
|
|
}
|
|
|
|
|
|
|
|
private static Stream<StructuredProperty> pidFromInstance(List<StructuredProperty> pid, KeyValue collectedFrom) {
|
|
|
|
return Optional
|
|
|
|
.ofNullable(pid)
|
|
|
|
.map(
|
|
|
|
pp -> pp
|
|
|
|
.stream()
|
|
|
|
// filter away PIDs provided by a DS that is not considered an authority for the
|
|
|
|
// given PID Type
|
|
|
|
.filter(p -> {
|
|
|
|
final PidType pType = PidType.tryValueOf(p.getQualifier().getClassid());
|
|
|
|
return Optional.ofNullable(collectedFrom).isPresent() &&
|
|
|
|
Optional
|
|
|
|
.ofNullable(PID_AUTHORITY.get(pType))
|
|
|
|
.map(authorities -> {
|
|
|
|
return authorities.containsKey(collectedFrom.getKey())
|
|
|
|
|| authorities.containsValue(collectedFrom.getValue());
|
|
|
|
})
|
|
|
|
.orElse(false);
|
|
|
|
})
|
|
|
|
.map(CleaningFunctions::normalizePidValue)
|
|
|
|
.filter(IdentifierFactory::pidFilter))
|
|
|
|
.orElse(Stream.empty());
|
|
|
|
}
|
|
|
|
|
2020-11-30 12:00:38 +01:00
|
|
|
/**
|
|
|
|
* @see {@link IdentifierFactory#createIdentifier(OafEntity, boolean)}
|
|
|
|
*/
|
|
|
|
public static <T extends OafEntity> String createIdentifier(T entity) {
|
|
|
|
|
|
|
|
return createIdentifier(entity, true);
|
|
|
|
}
|
|
|
|
|
2020-11-02 14:25:26 +01:00
|
|
|
protected static boolean pidFilter(StructuredProperty s) {
|
2020-12-02 09:30:34 +01:00
|
|
|
final String pidValue = s.getValue();
|
2020-11-03 18:43:37 +01:00
|
|
|
if (Objects.isNull(s.getQualifier()) ||
|
2020-12-02 09:30:34 +01:00
|
|
|
StringUtils.isBlank(pidValue) ||
|
|
|
|
StringUtils.isBlank(pidValue.replaceAll("(?:\\n|\\r|\\t|\\s)", ""))) {
|
2020-11-30 12:00:38 +01:00
|
|
|
return false;
|
|
|
|
}
|
2020-12-02 09:30:34 +01:00
|
|
|
if (CleaningFunctions.PID_BLACKLIST.contains(pidValue)) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
if (PidBlacklistProvider.getBlacklist(s.getQualifier().getClassid()).contains(pidValue)) {
|
2020-11-03 18:43:37 +01:00
|
|
|
return false;
|
|
|
|
}
|
2021-03-17 15:06:05 +01:00
|
|
|
return true;
|
2020-10-30 10:56:42 +01:00
|
|
|
}
|
|
|
|
|
2020-11-30 12:00:38 +01:00
|
|
|
private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s, boolean md5) {
|
2020-10-06 15:44:53 +02:00
|
|
|
return new StringBuilder()
|
2021-03-17 15:14:53 +01:00
|
|
|
.append(ModelSupport.getIdPrefix(entity.getClass()))
|
2020-10-07 13:14:31 +02:00
|
|
|
.append(ID_PREFIX_SEPARATOR)
|
|
|
|
.append(createPrefix(s.getQualifier().getClassid()))
|
|
|
|
.append(ID_SEPARATOR)
|
2020-12-02 09:30:34 +01:00
|
|
|
.append(md5 ? DHPUtils.md5(s.getValue()) : s.getValue())
|
2020-10-07 13:14:31 +02:00
|
|
|
.toString();
|
2020-10-06 15:44:53 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// create the prefix (length = 12)
|
|
|
|
private static String createPrefix(String pidType) {
|
|
|
|
StringBuilder prefix = new StringBuilder(StringUtils.left(pidType, ID_PREFIX_LEN));
|
|
|
|
while (prefix.length() < ID_PREFIX_LEN) {
|
|
|
|
prefix.append("_");
|
|
|
|
}
|
|
|
|
return prefix.substring(0, ID_PREFIX_LEN);
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|