90 lines
2.3 KiB
Java
90 lines
2.3 KiB
Java
|
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
|
|
|
import java.util.HashSet;
import java.util.Locale;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;

import eu.dnetlib.dhp.schema.oaf.Pid;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
|
|
|
public class CleaningFunctions {
|
|
|
|
public static final String DOI_PREFIX_REGEX = "(^10\\.|\\/10\\.)";
|
|
public static final String DOI_PREFIX = "10.";
|
|
|
|
public static final Set<String> PID_BLACKLIST = new HashSet<>();
|
|
|
|
static {
|
|
PID_BLACKLIST.add("none");
|
|
PID_BLACKLIST.add("na");
|
|
}
|
|
|
|
public CleaningFunctions() {}
|
|
|
|
/**
|
|
* Utility method that filter PID values on a per-type basis.
|
|
* @param s the PID whose value will be checked.
|
|
* @return false if the pid matches the filter criteria, true otherwise.
|
|
*/
|
|
public static boolean pidFilter(Pid s) {
|
|
final String pidValue = s.getValue();
|
|
if (Objects.isNull(s.getQualifier()) ||
|
|
StringUtils.isBlank(pidValue) ||
|
|
StringUtils.isBlank(pidValue.replaceAll("(?:\\n|\\r|\\t|\\s)", ""))) {
|
|
return false;
|
|
}
|
|
if (CleaningFunctions.PID_BLACKLIST.contains(pidValue)) {
|
|
return false;
|
|
}
|
|
return !PidBlacklistProvider.getBlacklist(s.getQualifier().getClassid()).contains(pidValue);
|
|
}
|
|
|
|
/**
|
|
* Utility method that normalises PID values on a per-type basis.
|
|
* @param pid the PID whose value will be normalised.
|
|
* @return the PID containing the normalised value.
|
|
*/
|
|
public static Pid normalizePidValue(Pid pid) {
|
|
pid.setValue(
|
|
normalizePidValue(
|
|
pid.getQualifier().getClassid(),
|
|
pid.getValue()));
|
|
|
|
return pid;
|
|
}
|
|
|
|
/**
|
|
* Utility method that normalises PID values on a per-type basis.
|
|
* @param pid the PID whose value will be normalised.
|
|
* @return the PID containing the normalised value.
|
|
*/
|
|
public static StructuredProperty normalizeSPValue(StructuredProperty pid) {
|
|
pid.setValue(
|
|
normalizePidValue(
|
|
pid.getQualifier().getClassid(),
|
|
pid.getValue()));
|
|
|
|
return pid;
|
|
}
|
|
|
|
public static String normalizePidValue(String pidType, String pidValue) {
|
|
String value = Optional
|
|
.ofNullable(pidValue)
|
|
.map(String::trim)
|
|
.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
|
|
|
|
switch (pidType) {
|
|
|
|
// TODO add cleaning for more PID types as needed
|
|
case "doi":
|
|
return value.toLowerCase().replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX);
|
|
}
|
|
return value;
|
|
}
|
|
|
|
}
|