forked from D-Net/dnet-hadoop
[cleaning] entries available as PIDs must not appear as alternateIdentifier
This commit is contained in:
parent
972d5a3d98
commit
9588bfba81
|
@ -5,6 +5,8 @@ import java.util.*;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import com.google.common.collect.Sets;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.PidBlacklistProvider;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import com.clearspring.analytics.util.Lists;
|
import com.clearspring.analytics.util.Lists;
|
||||||
|
@ -137,19 +139,7 @@ public class CleaningFunctions {
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
if (Objects.nonNull(r.getPid())) {
|
if (Objects.nonNull(r.getPid())) {
|
||||||
r
|
r.setPid(processPidCleaning(r.getPid()));
|
||||||
.setPid(
|
|
||||||
r
|
|
||||||
.getPid()
|
|
||||||
.stream()
|
|
||||||
.filter(Objects::nonNull)
|
|
||||||
.filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue())))
|
|
||||||
.filter(sp -> !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase()))
|
|
||||||
.filter(sp -> Objects.nonNull(sp.getQualifier()))
|
|
||||||
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
|
||||||
.map(CleaningFunctions::normalizePidValue)
|
|
||||||
.filter(CleaningFunctions::filterPid)
|
|
||||||
.collect(Collectors.toList()));
|
|
||||||
}
|
}
|
||||||
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
|
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
|
||||||
r
|
r
|
||||||
|
@ -157,7 +147,18 @@ public class CleaningFunctions {
|
||||||
qualifier(ModelConstants.UNKNOWN, "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
|
qualifier(ModelConstants.UNKNOWN, "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
|
||||||
}
|
}
|
||||||
if (Objects.nonNull(r.getInstance())) {
|
if (Objects.nonNull(r.getInstance())) {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
for (Instance i : r.getInstance()) {
|
for (Instance i : r.getInstance()) {
|
||||||
|
final Set<StructuredProperty> pids = Sets.newHashSet(i.getPid());
|
||||||
|
i.setAlternateIdentifier(
|
||||||
|
Optional.ofNullable(i.getAlternateIdentifier())
|
||||||
|
.map(altId -> altId.stream()
|
||||||
|
.filter(p -> !pids.contains(p))
|
||||||
|
.collect(Collectors.toList()))
|
||||||
|
.orElse(Lists.newArrayList()));
|
||||||
|
|
||||||
if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) {
|
if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) {
|
||||||
i
|
i
|
||||||
.setAccessright(
|
.setAccessright(
|
||||||
|
@ -234,6 +235,18 @@ public class CleaningFunctions {
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static List<StructuredProperty> processPidCleaning(List<StructuredProperty> pids) {
|
||||||
|
return pids.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
|
.filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue())))
|
||||||
|
.filter(sp -> !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase()))
|
||||||
|
.filter(sp -> Objects.nonNull(sp.getQualifier()))
|
||||||
|
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
||||||
|
.map(CleaningFunctions::normalizePidValue)
|
||||||
|
.filter(CleaningFunctions::pidFilter)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
protected static StructuredProperty cleanValue(StructuredProperty s) {
|
protected static StructuredProperty cleanValue(StructuredProperty s) {
|
||||||
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
|
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
|
||||||
return s;
|
return s;
|
||||||
|
@ -267,25 +280,23 @@ public class CleaningFunctions {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Utility method that filter PID values on a per-type basis.
|
* Utility method that filter PID values on a per-type basis.
|
||||||
* @param pid the PID whose value will be checked.
|
* @param s the PID whose value will be checked.
|
||||||
* @return true the PID containing the normalised value.
|
* @return false if the pid matches the filter criteria, true otherwise.
|
||||||
*/
|
*/
|
||||||
private static boolean filterPid(StructuredProperty pid) {
|
public static boolean pidFilter(StructuredProperty s) {
|
||||||
String value = Optional
|
final String pidValue = s.getValue();
|
||||||
.ofNullable(pid.getValue())
|
if (Objects.isNull(s.getQualifier()) ||
|
||||||
.map(s -> StringUtils.replaceAll(s, "\\s", ""))
|
StringUtils.isBlank(pidValue) ||
|
||||||
.orElse("");
|
StringUtils.isBlank(pidValue.replaceAll("(?:\\n|\\r|\\t|\\s)", ""))) {
|
||||||
if (StringUtils.isBlank(value)) {
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
switch (pid.getQualifier().getClassid()) {
|
if (CleaningFunctions.PID_BLACKLIST.contains(pidValue)) {
|
||||||
|
return false;
|
||||||
// TODO add cleaning for more PID types as needed
|
|
||||||
case "doi":
|
|
||||||
return value.startsWith(DOI_PREFIX);
|
|
||||||
default:
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
if (PidBlacklistProvider.getBlacklist(s.getQualifier().getClassid()).contains(pidValue)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -106,7 +106,7 @@ public class IdentifierFactory implements Serializable {
|
||||||
.getPid()
|
.getPid()
|
||||||
.stream()
|
.stream()
|
||||||
.map(CleaningFunctions::normalizePidValue)
|
.map(CleaningFunctions::normalizePidValue)
|
||||||
.filter(IdentifierFactory::pidFilter)
|
.filter(CleaningFunctions::pidFilter)
|
||||||
.collect(
|
.collect(
|
||||||
Collectors
|
Collectors
|
||||||
.groupingBy(
|
.groupingBy(
|
||||||
|
@ -136,21 +136,25 @@ public class IdentifierFactory implements Serializable {
|
||||||
// filter away PIDs provided by a DS that is not considered an authority for the
|
// filter away PIDs provided by a DS that is not considered an authority for the
|
||||||
// given PID Type
|
// given PID Type
|
||||||
.filter(p -> {
|
.filter(p -> {
|
||||||
final PidType pType = PidType.tryValueOf(p.getQualifier().getClassid());
|
return shouldFilterPid(collectedFrom, p);
|
||||||
return Optional.ofNullable(collectedFrom).isPresent() &&
|
|
||||||
Optional
|
|
||||||
.ofNullable(PID_AUTHORITY.get(pType))
|
|
||||||
.map(authorities -> {
|
|
||||||
return authorities.containsKey(collectedFrom.getKey())
|
|
||||||
|| authorities.containsValue(collectedFrom.getValue());
|
|
||||||
})
|
|
||||||
.orElse(false);
|
|
||||||
})
|
})
|
||||||
.map(CleaningFunctions::normalizePidValue)
|
.map(CleaningFunctions::normalizePidValue)
|
||||||
.filter(IdentifierFactory::pidFilter))
|
.filter(CleaningFunctions::pidFilter))
|
||||||
.orElse(Stream.empty());
|
.orElse(Stream.empty());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static boolean shouldFilterPid(KeyValue collectedFrom, StructuredProperty p) {
|
||||||
|
final PidType pType = PidType.tryValueOf(p.getQualifier().getClassid());
|
||||||
|
return pType.equals(PidType.handle) || Optional.ofNullable(collectedFrom).isPresent() &&
|
||||||
|
Optional
|
||||||
|
.ofNullable(PID_AUTHORITY.get(pType))
|
||||||
|
.map(authorities -> {
|
||||||
|
return authorities.containsKey(collectedFrom.getKey())
|
||||||
|
|| authorities.containsValue(collectedFrom.getValue());
|
||||||
|
})
|
||||||
|
.orElse(false);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @see {@link IdentifierFactory#createIdentifier(OafEntity, boolean)}
|
* @see {@link IdentifierFactory#createIdentifier(OafEntity, boolean)}
|
||||||
*/
|
*/
|
||||||
|
@ -159,22 +163,6 @@ public class IdentifierFactory implements Serializable {
|
||||||
return createIdentifier(entity, true);
|
return createIdentifier(entity, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected static boolean pidFilter(StructuredProperty s) {
|
|
||||||
final String pidValue = s.getValue();
|
|
||||||
if (Objects.isNull(s.getQualifier()) ||
|
|
||||||
StringUtils.isBlank(pidValue) ||
|
|
||||||
StringUtils.isBlank(pidValue.replaceAll("(?:\\n|\\r|\\t|\\s)", ""))) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (CleaningFunctions.PID_BLACKLIST.contains(pidValue)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (PidBlacklistProvider.getBlacklist(s.getQualifier().getClassid()).contains(pidValue)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s, boolean md5) {
|
private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s, boolean md5) {
|
||||||
return new StringBuilder()
|
return new StringBuilder()
|
||||||
.append(ModelSupport.getIdPrefix(entity.getClass()))
|
.append(ModelSupport.getIdPrefix(entity.getClass()))
|
||||||
|
|
Loading…
Reference in New Issue