2020-10-06 15:44:53 +02:00
|
|
|
|
|
|
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
|
|
|
|
2020-10-07 13:14:31 +02:00
|
|
|
import java.io.Serializable;
|
|
|
|
import java.util.Objects;
|
2020-10-30 10:56:42 +01:00
|
|
|
import java.util.Optional;
|
2020-10-07 13:14:31 +02:00
|
|
|
|
|
|
|
import org.apache.commons.lang.StringUtils;
|
|
|
|
|
2020-10-06 15:44:53 +02:00
|
|
|
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
|
|
|
import eu.dnetlib.dhp.utils.DHPUtils;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Factory class for OpenAIRE identifiers in the Graph
|
|
|
|
*/
|
|
|
|
public class IdentifierFactory implements Serializable {
|
|
|
|
|
2020-10-30 10:56:42 +01:00
|
|
|
public static final String DOI_URL_PREFIX = "^http(s?):\\/\\/(dx\\.)?doi\\.org\\/";
|
|
|
|
|
2020-10-06 15:44:53 +02:00
|
|
|
public static final String ID_SEPARATOR = "::";
|
|
|
|
public static final String ID_PREFIX_SEPARATOR = "|";
|
2020-10-07 13:14:31 +02:00
|
|
|
public final static String ID_REGEX = "^[0-9][0-9]\\" + ID_PREFIX_SEPARATOR + ".{12}" + ID_SEPARATOR
|
|
|
|
+ "[a-zA-Z0-9]{32}$";
|
2020-10-06 15:44:53 +02:00
|
|
|
public static final int ID_PREFIX_LEN = 12;
|
2020-11-02 14:25:26 +01:00
|
|
|
public static final String NONE = "none";
|
2020-10-06 15:44:53 +02:00
|
|
|
|
2020-10-30 10:56:42 +01:00
|
|
|
/**
|
|
|
|
* Creates an identifier from the most relevant PID (if available) in the given entity T. Returns entity.id
|
|
|
|
* when no PID is available
|
|
|
|
* @param entity the entity providing PIDs and a default ID.
|
|
|
|
* @param <T> the specific entity type. Currently Organization and Result subclasses are supported.
|
|
|
|
* @return an identifier from the most relevant PID, entity.id otherwise
|
|
|
|
*/
|
2020-10-06 15:44:53 +02:00
|
|
|
public static <T extends OafEntity> String createIdentifier(T entity) {
|
|
|
|
|
|
|
|
if (Objects.isNull(entity.getPid()) || entity.getPid().isEmpty()) {
|
|
|
|
return entity.getId();
|
|
|
|
}
|
|
|
|
|
|
|
|
return entity
|
2020-10-07 13:14:31 +02:00
|
|
|
.getPid()
|
|
|
|
.stream()
|
2020-11-02 14:25:26 +01:00
|
|
|
.filter(s -> pidFilter(s))
|
2020-10-07 13:14:31 +02:00
|
|
|
.min(new PidComparator<>(entity))
|
|
|
|
.map(s -> idFromPid(entity, s))
|
|
|
|
.map(IdentifierFactory::verifyIdSyntax)
|
|
|
|
.orElseGet(entity::getId);
|
2020-10-06 15:44:53 +02:00
|
|
|
}
|
|
|
|
|
2020-11-02 14:25:26 +01:00
|
|
|
protected static boolean pidFilter(StructuredProperty s) {
|
|
|
|
return Objects.nonNull(s.getQualifier()) &&
|
|
|
|
PidType.isValid(s.getQualifier().getClassid()) &&
|
|
|
|
StringUtils.isNotBlank(StringUtils.trim(s.getValue())) &&
|
|
|
|
!NONE.equals(StringUtils.trim(StringUtils.lowerCase(s.getValue())));
|
|
|
|
}
|
|
|
|
|
2020-10-30 10:56:42 +01:00
|
|
|
/**
|
|
|
|
* Utility method that normalises PID values on a per-type basis.
|
|
|
|
* @param pid the PID whose value will be normalised.
|
|
|
|
* @return the PID containing the normalised value.
|
|
|
|
*/
|
|
|
|
public static StructuredProperty normalizePidValue(StructuredProperty pid) {
|
|
|
|
String value = Optional
|
|
|
|
.ofNullable(pid.getValue())
|
|
|
|
.map(String::trim)
|
|
|
|
.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
|
|
|
|
switch (pid.getQualifier().getClassid()) {
|
|
|
|
|
|
|
|
// TODO add cleaning for more PID types as needed
|
|
|
|
case "doi":
|
|
|
|
pid.setValue(value.toLowerCase().replaceAll(DOI_URL_PREFIX, ""));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
return pid;
|
|
|
|
}
|
|
|
|
|
2020-10-06 18:55:23 +02:00
|
|
|
private static String verifyIdSyntax(String s) {
|
2020-10-07 13:14:31 +02:00
|
|
|
if (StringUtils.isBlank(s) || !s.matches(ID_REGEX)) {
|
2020-10-06 15:44:53 +02:00
|
|
|
throw new RuntimeException(String.format("malformed id: '%s'", s));
|
|
|
|
} else {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s) {
|
|
|
|
return new StringBuilder()
|
2020-10-07 13:14:31 +02:00
|
|
|
.append(StringUtils.substringBefore(entity.getId(), ID_PREFIX_SEPARATOR))
|
|
|
|
.append(ID_PREFIX_SEPARATOR)
|
|
|
|
.append(createPrefix(s.getQualifier().getClassid()))
|
|
|
|
.append(ID_SEPARATOR)
|
2020-10-30 10:56:42 +01:00
|
|
|
.append(DHPUtils.md5(normalizePidValue(s).getValue()))
|
2020-10-07 13:14:31 +02:00
|
|
|
.toString();
|
2020-10-06 15:44:53 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// create the prefix (length = 12)
|
|
|
|
private static String createPrefix(String pidType) {
|
|
|
|
StringBuilder prefix = new StringBuilder(StringUtils.left(pidType, ID_PREFIX_LEN));
|
|
|
|
while (prefix.length() < ID_PREFIX_LEN) {
|
|
|
|
prefix.append("_");
|
|
|
|
}
|
|
|
|
return prefix.substring(0, ID_PREFIX_LEN);
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|