[cleaning] entries available as PIDs must not appear as alternateIdentifier

This commit is contained in:
Claudio Atzori 2021-03-19 09:07:30 +01:00
parent 972d5a3d98
commit 9588bfba81
2 changed files with 54 additions and 55 deletions

View File

@ -5,6 +5,8 @@ import java.util.*;
import java.util.function.Function; import java.util.function.Function;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.schema.oaf.utils.PidBlacklistProvider;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import com.clearspring.analytics.util.Lists; import com.clearspring.analytics.util.Lists;
@ -137,19 +139,7 @@ public class CleaningFunctions {
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }
if (Objects.nonNull(r.getPid())) { if (Objects.nonNull(r.getPid())) {
r r.setPid(processPidCleaning(r.getPid()));
.setPid(
r
.getPid()
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue())))
.filter(sp -> !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase()))
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(CleaningFunctions::normalizePidValue)
.filter(CleaningFunctions::filterPid)
.collect(Collectors.toList()));
} }
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) { if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
r r
@ -157,7 +147,18 @@ public class CleaningFunctions {
qualifier(ModelConstants.UNKNOWN, "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE)); qualifier(ModelConstants.UNKNOWN, "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
} }
if (Objects.nonNull(r.getInstance())) { if (Objects.nonNull(r.getInstance())) {
for (Instance i : r.getInstance()) { for (Instance i : r.getInstance()) {
final Set<StructuredProperty> pids = Sets.newHashSet(i.getPid());
i.setAlternateIdentifier(
Optional.ofNullable(i.getAlternateIdentifier())
.map(altId -> altId.stream()
.filter(p -> !pids.contains(p))
.collect(Collectors.toList()))
.orElse(Lists.newArrayList()));
if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) { if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) {
i i
.setAccessright( .setAccessright(
@ -234,6 +235,18 @@ public class CleaningFunctions {
return value; return value;
} }
/**
 * Cleans a list of PIDs, returning only the entries that pass every check.
 *
 * An entry is discarded when it is null, when its value is blank, when its
 * trimmed and lower-cased value is listed in PID_BLACKLIST, or when its
 * qualifier is missing or has a blank classid. Each surviving entry is passed
 * through normalizePidValue and is kept only if pidFilter accepts the result.
 *
 * @param pids the PIDs to clean; must not be null (callers check beforehand)
 * @return a new list holding the normalised PIDs, in their original order
 */
private static List<StructuredProperty> processPidCleaning(List<StructuredProperty> pids) {
	final List<StructuredProperty> cleaned = new ArrayList<>();
	for (final StructuredProperty candidate : pids) {
		if (Objects.isNull(candidate)) {
			continue;
		}
		// a null value is trimmed to null and therefore treated as blank
		if (!StringUtils.isNotBlank(StringUtils.trim(candidate.getValue()))) {
			continue;
		}
		if (PID_BLACKLIST.contains(candidate.getValue().trim().toLowerCase())) {
			continue;
		}
		if (Objects.isNull(candidate.getQualifier())) {
			continue;
		}
		if (!StringUtils.isNotBlank(candidate.getQualifier().getClassid())) {
			continue;
		}
		final StructuredProperty normalised = CleaningFunctions.normalizePidValue(candidate);
		if (CleaningFunctions.pidFilter(normalised)) {
			cleaned.add(normalised);
		}
	}
	return cleaned;
}
protected static StructuredProperty cleanValue(StructuredProperty s) { protected static StructuredProperty cleanValue(StructuredProperty s) {
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " ")); s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
return s; return s;
@ -267,25 +280,23 @@ public class CleaningFunctions {
/** /**
* Utility method that filter PID values on a per-type basis. * Utility method that filter PID values on a per-type basis.
* @param pid the PID whose value will be checked. * @param s the PID whose value will be checked.
* @return true the PID containing the normalised value. * @return false if the pid matches the filter criteria, true otherwise.
*/ */
private static boolean filterPid(StructuredProperty pid) { public static boolean pidFilter(StructuredProperty s) {
String value = Optional final String pidValue = s.getValue();
.ofNullable(pid.getValue()) if (Objects.isNull(s.getQualifier()) ||
.map(s -> StringUtils.replaceAll(s, "\\s", "")) StringUtils.isBlank(pidValue) ||
.orElse(""); StringUtils.isBlank(pidValue.replaceAll("(?:\\n|\\r|\\t|\\s)", ""))) {
if (StringUtils.isBlank(value)) {
return false; return false;
} }
switch (pid.getQualifier().getClassid()) { if (CleaningFunctions.PID_BLACKLIST.contains(pidValue)) {
return false;
// TODO add cleaning for more PID types as needed
case "doi":
return value.startsWith(DOI_PREFIX);
default:
return true;
} }
if (PidBlacklistProvider.getBlacklist(s.getQualifier().getClassid()).contains(pidValue)) {
return false;
}
return true;
} }
/** /**

View File

@ -106,7 +106,7 @@ public class IdentifierFactory implements Serializable {
.getPid() .getPid()
.stream() .stream()
.map(CleaningFunctions::normalizePidValue) .map(CleaningFunctions::normalizePidValue)
.filter(IdentifierFactory::pidFilter) .filter(CleaningFunctions::pidFilter)
.collect( .collect(
Collectors Collectors
.groupingBy( .groupingBy(
@ -136,21 +136,25 @@ public class IdentifierFactory implements Serializable {
// filter away PIDs provided by a DS that is not considered an authority for the // filter away PIDs provided by a DS that is not considered an authority for the
// given PID Type // given PID Type
.filter(p -> { .filter(p -> {
final PidType pType = PidType.tryValueOf(p.getQualifier().getClassid()); return shouldFilterPid(collectedFrom, p);
return Optional.ofNullable(collectedFrom).isPresent() &&
Optional
.ofNullable(PID_AUTHORITY.get(pType))
.map(authorities -> {
return authorities.containsKey(collectedFrom.getKey())
|| authorities.containsValue(collectedFrom.getValue());
})
.orElse(false);
}) })
.map(CleaningFunctions::normalizePidValue) .map(CleaningFunctions::normalizePidValue)
.filter(IdentifierFactory::pidFilter)) .filter(CleaningFunctions::pidFilter))
.orElse(Stream.empty()); .orElse(Stream.empty());
} }
/**
 * Predicate used when selecting PIDs: despite its name, a {@code true} result
 * means the PID is retained by the calling stream filter.
 *
 * A PID is retained when its type is {@code handle}, or when
 * {@code collectedFrom} is present and PID_AUTHORITY lists it (by key or by
 * value) among the authorities registered for the PID type.
 *
 * A PID that is null or has no qualifier is rejected instead of raising a
 * NullPointerException: this check runs before pidFilter, which is the step
 * that discards entries without a qualifier.
 *
 * @param collectedFrom the datasource the record was collected from, may be null
 * @param p the PID to evaluate
 * @return true if the PID must be retained, false if it must be dropped
 */
private static boolean shouldFilterPid(KeyValue collectedFrom, StructuredProperty p) {
	if (p == null || p.getQualifier() == null) {
		return false;
	}
	final PidType pType = PidType.tryValueOf(p.getQualifier().getClassid());
	// constant-first comparison stays safe even if the lookup yields null
	if (PidType.handle.equals(pType)) {
		return true;
	}
	if (collectedFrom == null) {
		return false;
	}
	return Optional
		.ofNullable(PID_AUTHORITY.get(pType))
		.map(
			authorities -> authorities.containsKey(collectedFrom.getKey())
				|| authorities.containsValue(collectedFrom.getValue()))
		.orElse(false);
}
/** /**
* @see {@link IdentifierFactory#createIdentifier(OafEntity, boolean)} * @see {@link IdentifierFactory#createIdentifier(OafEntity, boolean)}
*/ */
@ -159,22 +163,6 @@ public class IdentifierFactory implements Serializable {
return createIdentifier(entity, true); return createIdentifier(entity, true);
} }
protected static boolean pidFilter(StructuredProperty s) {
final String pidValue = s.getValue();
if (Objects.isNull(s.getQualifier()) ||
StringUtils.isBlank(pidValue) ||
StringUtils.isBlank(pidValue.replaceAll("(?:\\n|\\r|\\t|\\s)", ""))) {
return false;
}
if (CleaningFunctions.PID_BLACKLIST.contains(pidValue)) {
return false;
}
if (PidBlacklistProvider.getBlacklist(s.getQualifier().getClassid()).contains(pidValue)) {
return false;
}
return true;
}
private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s, boolean md5) { private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s, boolean md5) {
return new StringBuilder() return new StringBuilder()
.append(ModelSupport.getIdPrefix(entity.getClass())) .append(ModelSupport.getIdPrefix(entity.getClass()))