forked from antonis.lempesis/dnet-hadoop
trivial: the less magic numbers and values around, the better
This commit is contained in:
parent
6848d0c3d7
commit
723b01f9e9
|
@ -16,7 +16,10 @@ import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
public class CleaningFunctions {
|
public class CleaningFunctions {
|
||||||
|
|
||||||
public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/";
|
public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/";
|
||||||
|
|
||||||
public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})";
|
public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})";
|
||||||
|
public static final int ORCID_LEN = 19;
|
||||||
|
|
||||||
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
|
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
|
||||||
|
|
||||||
public static final Set<String> PID_BLACKLIST = new HashSet<>();
|
public static final Set<String> PID_BLACKLIST = new HashSet<>();
|
||||||
|
@ -86,7 +89,7 @@ public class CleaningFunctions {
|
||||||
} else if (value instanceof Organization) {
|
} else if (value instanceof Organization) {
|
||||||
Organization o = (Organization) value;
|
Organization o = (Organization) value;
|
||||||
if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) {
|
if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) {
|
||||||
o.setCountry(qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_COUNTRY_TYPE));
|
o.setCountry(qualifier(ModelConstants.UNKNOWN, "Unknown", ModelConstants.DNET_COUNTRY_TYPE));
|
||||||
}
|
}
|
||||||
} else if (value instanceof Relation) {
|
} else if (value instanceof Relation) {
|
||||||
// nothing to clean here
|
// nothing to clean here
|
||||||
|
@ -153,12 +156,14 @@ public class CleaningFunctions {
|
||||||
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
|
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
|
||||||
r
|
r
|
||||||
.setResourcetype(
|
.setResourcetype(
|
||||||
qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
|
qualifier(ModelConstants.UNKNOWN, "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
|
||||||
}
|
}
|
||||||
if (Objects.nonNull(r.getInstance())) {
|
if (Objects.nonNull(r.getInstance())) {
|
||||||
for (Instance i : r.getInstance()) {
|
for (Instance i : r.getInstance()) {
|
||||||
if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) {
|
if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) {
|
||||||
i.setAccessright(qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES));
|
i
|
||||||
|
.setAccessright(
|
||||||
|
qualifier(ModelConstants.UNKNOWN, "not available", ModelConstants.DNET_ACCESS_MODES));
|
||||||
}
|
}
|
||||||
if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) {
|
if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) {
|
||||||
i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY);
|
i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY);
|
||||||
|
@ -173,7 +178,7 @@ public class CleaningFunctions {
|
||||||
if (Objects.isNull(bestaccessrights)) {
|
if (Objects.isNull(bestaccessrights)) {
|
||||||
r
|
r
|
||||||
.setBestaccessright(
|
.setBestaccessright(
|
||||||
qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES));
|
qualifier(ModelConstants.UNKNOWN, "not available", ModelConstants.DNET_ACCESS_MODES));
|
||||||
} else {
|
} else {
|
||||||
r.setBestaccessright(bestaccessrights);
|
r.setBestaccessright(bestaccessrights);
|
||||||
}
|
}
|
||||||
|
@ -227,7 +232,7 @@ public class CleaningFunctions {
|
||||||
.trim()
|
.trim()
|
||||||
.toLowerCase()
|
.toLowerCase()
|
||||||
.replaceAll(ORCID_CLEANING_REGEX, "$1-$2-$3-$4");
|
.replaceAll(ORCID_CLEANING_REGEX, "$1-$2-$3-$4");
|
||||||
if (orcid.length() == 19) {
|
if (orcid.length() == ORCID_LEN) {
|
||||||
p.setValue(orcid);
|
p.setValue(orcid);
|
||||||
} else {
|
} else {
|
||||||
p.setValue("");
|
p.setValue("");
|
||||||
|
|
Loading…
Reference in New Issue