forked from D-Net/dnet-hadoop
introduced filtering for DOIs in graph cleaning workflow
This commit is contained in:
parent
21ddcf3a73
commit
491ad24750
|
@ -13,7 +13,7 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
|
|
||||||
public class CleaningFunctions {
|
public class CleaningFunctions {
|
||||||
|
|
||||||
public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/";
|
public static final String DOI_PREFIX_REGEX = "^.*10\\.";
|
||||||
public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
|
public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
|
||||||
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
|
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
|
||||||
|
|
||||||
|
@ -146,6 +146,7 @@ public class CleaningFunctions {
|
||||||
.filter(sp -> Objects.nonNull(sp.getQualifier()))
|
.filter(sp -> Objects.nonNull(sp.getQualifier()))
|
||||||
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
||||||
.map(CleaningFunctions::normalizePidValue)
|
.map(CleaningFunctions::normalizePidValue)
|
||||||
|
.filter(CleaningFunctions::filterPid)
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
|
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
|
||||||
|
@ -253,6 +254,29 @@ public class CleaningFunctions {
|
||||||
classid, classname, scheme, scheme);
|
classid, classname, scheme, scheme);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Utility method that filter PID values on a per-type basis.
|
||||||
|
* @param pid the PID whose value will be checked.
|
||||||
|
* @return true the PID containing the normalised value.
|
||||||
|
*/
|
||||||
|
private static boolean filterPid(StructuredProperty pid) {
|
||||||
|
String value = Optional
|
||||||
|
.ofNullable(pid.getValue())
|
||||||
|
.map(s -> StringUtils.replaceAll(s, "\\s", ""))
|
||||||
|
.orElse("");
|
||||||
|
if (StringUtils.isBlank(value)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
switch (pid.getQualifier().getClassid()) {
|
||||||
|
|
||||||
|
// TODO add cleaning for more PID types as needed
|
||||||
|
case "doi":
|
||||||
|
return value.startsWith("10.");
|
||||||
|
default:
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Utility method that normalises PID values on a per-type basis.
|
* Utility method that normalises PID values on a per-type basis.
|
||||||
* @param pid the PID whose value will be normalised.
|
* @param pid the PID whose value will be normalised.
|
||||||
|
@ -267,7 +291,7 @@ public class CleaningFunctions {
|
||||||
|
|
||||||
// TODO add cleaning for more PID types as needed
|
// TODO add cleaning for more PID types as needed
|
||||||
case "doi":
|
case "doi":
|
||||||
pid.setValue(value.toLowerCase().replaceAll(DOI_URL_PREFIX_REGEX, ""));
|
pid.setValue(value.toLowerCase().replaceAll(DOI_PREFIX_REGEX, "10."));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
return pid;
|
return pid;
|
||||||
|
|
|
@ -1,18 +1,18 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.*;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.CleaningFunctions;
|
import eu.dnetlib.dhp.schema.oaf.CleaningFunctions;
|
||||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Objects;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Factory class for OpenAIRE identifiers in the Graph
|
* Factory class for OpenAIRE identifiers in the Graph
|
||||||
|
@ -21,8 +21,6 @@ public class IdentifierFactory implements Serializable {
|
||||||
|
|
||||||
public static final String ID_SEPARATOR = "::";
|
public static final String ID_SEPARATOR = "::";
|
||||||
public static final String ID_PREFIX_SEPARATOR = "|";
|
public static final String ID_PREFIX_SEPARATOR = "|";
|
||||||
public final static String ID_REGEX = "^[0-9][0-9]\\" + ID_PREFIX_SEPARATOR + ".{12}" + ID_SEPARATOR
|
|
||||||
+ "[a-zA-Z0-9]{32}$";
|
|
||||||
|
|
||||||
public final static String DOI_REGEX = "(^10\\.[0-9]{4,9}\\/[-._;()\\/:a-zA-Z0-9]+$)|" +
|
public final static String DOI_REGEX = "(^10\\.[0-9]{4,9}\\/[-._;()\\/:a-zA-Z0-9]+$)|" +
|
||||||
"(^10\\.1002\\/[^\\s]+$)|" +
|
"(^10\\.1002\\/[^\\s]+$)|" +
|
||||||
|
|
Loading…
Reference in New Issue