forked from D-Net/dnet-hadoop
WIP: pid cleaning
This commit is contained in:
parent
da7b66c542
commit
1d33074fd1
|
@ -0,0 +1,14 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
public class DoiCleaningRule {
|
||||
|
||||
public static String clean(final String doi) {
|
||||
return doi
|
||||
.toLowerCase()
|
||||
.replaceAll("\\s", "")
|
||||
.replaceAll("^doi:", "")
|
||||
.replaceFirst(CleaningFunctions.DOI_PREFIX_REGEX, CleaningFunctions.DOI_PREFIX);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,23 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
 * Cleaning rule for FundRef identifiers.
 */
public class FundRefCleaningRule {

    /** Matches any single whitespace character; compiled once instead of on every call. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s");

    /** A cleaned FundRef identifier must consist of digits only. */
    private static final Pattern DIGITS = Pattern.compile("\\d+");

    /**
     * Normalises a FundRef identifier: the value is lower-cased, stripped of all whitespace and
     * accepted only when what remains is made of digits exclusively.
     *
     * @param fundrefId the raw identifier, must not be {@code null}
     * @return the whitespace-free digit string, or the empty string when the value is not purely numeric
     * @throws NullPointerException if {@code fundrefId} is {@code null}
     */
    public static String clean(final String fundrefId) {
        // Locale.ROOT keeps the normalisation independent from the JVM default locale.
        final String s = WHITESPACE
            .matcher(fundrefId.toLowerCase(java.util.Locale.ROOT))
            .replaceAll("");

        // A full match means the whole cleaned string is the identifier.
        return DIGITS.matcher(s).matches() ? s : "";
    }

}
|
|
@ -162,19 +162,15 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
}
|
||||
|
||||
public static <T extends Oaf> T fixVocabularyNames(T value) {
|
||||
if (value instanceof Datasource) {
|
||||
// nothing to clean here
|
||||
} else if (value instanceof Project) {
|
||||
// nothing to clean here
|
||||
} else if (value instanceof Organization) {
|
||||
Organization o = (Organization) value;
|
||||
if (Objects.nonNull(o.getCountry())) {
|
||||
fixVocabName(o.getCountry(), ModelConstants.DNET_COUNTRY_TYPE);
|
||||
}
|
||||
} else if (value instanceof Relation) {
|
||||
// nothing to clean here
|
||||
} else if (value instanceof Result) {
|
||||
if (value instanceof OafEntity) {
|
||||
|
||||
OafEntity e = (OafEntity) value;
|
||||
|
||||
Optional
|
||||
.ofNullable(e.getPid())
|
||||
.ifPresent(pid -> pid.forEach(p -> fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES)));
|
||||
|
||||
if (value instanceof Result) {
|
||||
Result r = (Result) value;
|
||||
|
||||
fixVocabName(r.getLanguage(), ModelConstants.DNET_LANGUAGES);
|
||||
|
@ -188,6 +184,11 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
for (Instance i : r.getInstance()) {
|
||||
fixVocabName(i.getAccessright(), ModelConstants.DNET_ACCESS_MODES);
|
||||
fixVocabName(i.getRefereed(), ModelConstants.DNET_REVIEW_LEVELS);
|
||||
Optional
|
||||
.ofNullable(i.getPid())
|
||||
.ifPresent(
|
||||
pid -> pid.forEach(p -> fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES)));
|
||||
|
||||
}
|
||||
}
|
||||
if (Objects.nonNull(r.getAuthor())) {
|
||||
|
@ -208,6 +209,19 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
} else if (value instanceof Software) {
|
||||
|
||||
}
|
||||
} else if (value instanceof Datasource) {
|
||||
// nothing to clean here
|
||||
} else if (value instanceof Project) {
|
||||
// nothing to clean here
|
||||
} else if (value instanceof Organization) {
|
||||
Organization o = (Organization) value;
|
||||
if (Objects.nonNull(o.getCountry())) {
|
||||
fixVocabName(o.getCountry(), ModelConstants.DNET_COUNTRY_TYPE);
|
||||
}
|
||||
|
||||
}
|
||||
} else if (value instanceof Relation) {
|
||||
// nothing to clean here
|
||||
}
|
||||
|
||||
return value;
|
||||
|
@ -260,6 +274,14 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
}
|
||||
|
||||
public static <T extends Oaf> T cleanup(T value, VocabularyGroup vocs) {
|
||||
|
||||
if (value instanceof OafEntity) {
|
||||
|
||||
OafEntity e = (OafEntity) value;
|
||||
if (Objects.nonNull(e.getPid())) {
|
||||
e.setPid(processPidCleaning(e.getPid()));
|
||||
}
|
||||
|
||||
if (value instanceof Datasource) {
|
||||
// nothing to clean here
|
||||
} else if (value instanceof Project) {
|
||||
|
@ -269,19 +291,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) {
|
||||
o.setCountry(ModelConstants.UNKNOWN_COUNTRY);
|
||||
}
|
||||
} else if (value instanceof Relation) {
|
||||
Relation r = (Relation) value;
|
||||
|
||||
Optional<String> validationDate = doCleanDate(r.getValidationDate());
|
||||
if (validationDate.isPresent()) {
|
||||
r.setValidationDate(validationDate.get());
|
||||
r.setValidated(true);
|
||||
} else {
|
||||
r.setValidationDate(null);
|
||||
r.setValidated(false);
|
||||
}
|
||||
} else if (value instanceof Result) {
|
||||
|
||||
Result r = (Result) value;
|
||||
|
||||
if (Objects.nonNull(r.getDateofacceptance())) {
|
||||
|
@ -394,9 +404,6 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
.map(GraphCleaningFunctions::cleanValue)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (Objects.nonNull(r.getPid())) {
|
||||
r.setPid(processPidCleaning(r.getPid()));
|
||||
}
|
||||
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
|
||||
r
|
||||
.setResourcetype(
|
||||
|
@ -405,13 +412,15 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
if (Objects.nonNull(r.getInstance())) {
|
||||
|
||||
for (Instance i : r.getInstance()) {
|
||||
if (!vocs.termExists(ModelConstants.DNET_PUBLICATION_RESOURCE, i.getInstancetype().getClassid())) {
|
||||
if (!vocs
|
||||
.termExists(ModelConstants.DNET_PUBLICATION_RESOURCE, i.getInstancetype().getClassid())) {
|
||||
if (r instanceof Publication) {
|
||||
i
|
||||
.setInstancetype(
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
"0038", "Other literature type", ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
"0038", "Other literature type",
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE));
|
||||
} else if (r instanceof Dataset) {
|
||||
i
|
||||
|
@ -455,7 +464,8 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
});
|
||||
});
|
||||
|
||||
if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) {
|
||||
if (Objects.isNull(i.getAccessright())
|
||||
|| StringUtils.isBlank(i.getAccessright().getClassid())) {
|
||||
i
|
||||
.setAccessright(
|
||||
accessRight(
|
||||
|
@ -478,7 +488,8 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
}
|
||||
}
|
||||
}
|
||||
if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) {
|
||||
if (Objects.isNull(r.getBestaccessright())
|
||||
|| StringUtils.isBlank(r.getBestaccessright().getClassid())) {
|
||||
Qualifier bestaccessrights = OafMapperUtils.createBestAccessRights(r.getInstance());
|
||||
if (Objects.isNull(bestaccessrights)) {
|
||||
r
|
||||
|
@ -574,6 +585,20 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
} else if (value instanceof Software) {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} else if (value instanceof Relation) {
|
||||
Relation r = (Relation) value;
|
||||
|
||||
Optional<String> validationDate = doCleanDate(r.getValidationDate());
|
||||
if (validationDate.isPresent()) {
|
||||
r.setValidationDate(validationDate.get());
|
||||
r.setValidated(true);
|
||||
} else {
|
||||
r.setValidationDate(null);
|
||||
r.setValidated(false);
|
||||
}
|
||||
}
|
||||
|
||||
return value;
|
||||
|
@ -628,7 +653,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
.filter(sp -> !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase()))
|
||||
.filter(sp -> Objects.nonNull(sp.getQualifier()))
|
||||
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
||||
.map(CleaningFunctions::normalizePidValue)
|
||||
.map(PidCleaner::normalizePidValue)
|
||||
.filter(CleaningFunctions::pidFilter)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
|
|
@ -0,0 +1,18 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
 * Cleaning rule for GRID identifiers.
 */
public class GridCleaningRule {

    /** Matches any single whitespace character; compiled once instead of on every call. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s");

    /**
     * Optional {@code grid.} prefix followed by the identifier proper (captured in group 1):
     * 4 to 6 digits, a dot, and 1 or 2 lower-case alphanumeric characters.
     */
    private static final Pattern GRID = Pattern.compile("(?:grid\\.)?(\\d{4,6}\\.[0-9a-z]{1,2})");

    /**
     * Normalises a GRID identifier to the form {@code grid.<digits>.<suffix>}.
     * Whitespace is removed and the value is lower-cased before validation. Both the bare
     * identifier ({@code 4444.00}) and the already prefixed form ({@code grid.4444.00}) are
     * accepted, so applying the rule to its own output yields the same value.
     *
     * @param grid the raw identifier, must not be {@code null}
     * @return the normalised identifier, or the empty string when the value is not a valid GRID id
     * @throws NullPointerException if {@code grid} is {@code null}
     */
    public static String clean(String grid) {
        // Locale.ROOT keeps the lower-casing independent from the JVM default locale
        // (e.g. 'I' would become the dotless 'ı' under a Turkish locale and fail [0-9a-z]).
        final String s = WHITESPACE
            .matcher(grid)
            .replaceAll("")
            .toLowerCase(java.util.Locale.ROOT);

        final Matcher m = GRID.matcher(s);
        return m.matches() ? "grid." + m.group(1) : "";
    }

}
|
|
@ -0,0 +1,19 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
// https://www.wikidata.org/wiki/Property:P213
|
||||
/**
 * Cleaning rule for ISNI identifiers.
 */
public class ISNICleaningRule {

    /**
     * Four groups of four characters, optionally separated by a single space: the first group
     * must be {@code 0000}, the last character may be a digit or an upper-case {@code X}.
     * Compiled once instead of on every call.
     */
    private static final Pattern ISNI = Pattern
        .compile("([0]{4}) ?([0-9]{4}) ?([0-9]{4}) ?([0-9]{3}[0-9X])");

    /**
     * Normalises an ISNI to its compact 16-character form by dropping the optional single
     * spaces between the four blocks.
     *
     * @param isni the raw identifier, must not be {@code null}
     * @return the 16-character identifier without separators, or the empty string when the
     *         whole value does not match the expected layout
     * @throws NullPointerException if {@code isni} is {@code null}
     */
    public static String clean(final String isni) {
        final Matcher m = ISNI.matcher(isni);
        if (!m.matches()) {
            return "";
        }
        // Concatenate the four captured blocks, leaving the separators out.
        return m.group(1) + m.group(2) + m.group(3) + m.group(4);
    }
}
|
|
@ -0,0 +1,19 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
 * Cleaning rule for PIC identifiers.
 */
public class PICCleaningRule {

    /** A PIC is accepted only when it is made of exactly nine digits; compiled once. */
    private static final Pattern PIC = Pattern.compile("\\d{9}");

    /**
     * Validates a PIC value. No normalisation is applied: the value is either returned
     * unchanged or rejected.
     *
     * @param pic the raw identifier, must not be {@code null}
     * @return {@code pic} itself when it consists of exactly nine digits, the empty string otherwise
     * @throws NullPointerException if {@code pic} is {@code null}
     */
    public static String clean(final String pic) {
        // A full match covers the entire input, so the input is the result.
        return PIC.matcher(pic).matches() ? pic : "";
    }

}
|
|
@ -0,0 +1,62 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class PidCleaner {
|
||||
|
||||
/**
|
||||
* Utility method that normalises PID values on a per-type basis.
|
||||
* @param pid the PID whose value will be normalised.
|
||||
* @return the PID containing the normalised value.
|
||||
*/
|
||||
public static StructuredProperty normalizePidValue(StructuredProperty pid) {
|
||||
pid
|
||||
.setValue(
|
||||
normalizePidValue(
|
||||
pid.getQualifier().getClassid(),
|
||||
pid.getValue()));
|
||||
|
||||
return pid;
|
||||
}
|
||||
|
||||
public static String normalizePidValue(String pidType, String pidValue) {
|
||||
String value = Optional
|
||||
.ofNullable(pidValue)
|
||||
.map(String::trim)
|
||||
.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
|
||||
|
||||
switch (pidType) {
|
||||
|
||||
// TODO add cleaning for more PID types as needed
|
||||
|
||||
// Result
|
||||
case "doi":
|
||||
return DoiCleaningRule.clean(value);
|
||||
case "pmid":
|
||||
return PmidCleaningRule.clean(value);
|
||||
case "pmc":
|
||||
return PmcCleaningRule.clean(value);
|
||||
case "handle":
|
||||
case "arXiv":
|
||||
return value;
|
||||
|
||||
// Organization
|
||||
case "GRID":
|
||||
return GridCleaningRule.clean(value);
|
||||
case "ISNI":
|
||||
return ISNICleaningRule.clean(value);
|
||||
case "ROR":
|
||||
return RorCleaningRule.clean(value);
|
||||
case "PIC":
|
||||
return PICCleaningRule.clean(value);
|
||||
case "FundRef":
|
||||
return FundRefCleaningRule.clean(value);
|
||||
default:
|
||||
return value;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,13 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
/**
 * Cleaning rule for PubMed Central (PMC) identifiers.
 */
public class PmcCleaningRule {

    /** Matches any single whitespace character; compiled once instead of on every call. */
    private static final java.util.regex.Pattern WHITESPACE = java.util.regex.Pattern.compile("\\s");

    /** The literal {@code PMC} followed by 1 to 8 digits. */
    private static final java.util.regex.Pattern PMC = java.util.regex.Pattern.compile("PMC\\d{1,8}");

    /**
     * Normalises a PMC identifier: whitespace is removed and the value is upper-cased, then it
     * is accepted only when it has the form {@code PMC} followed by 1 to 8 digits.
     *
     * @param pmc the raw identifier, must not be {@code null}
     * @return the normalised identifier, or the empty string when the value is not a valid PMC id
     * @throws NullPointerException if {@code pmc} is {@code null}
     */
    public static String clean(String pmc) {
        // Locale.ROOT keeps the upper-casing independent from the JVM default locale.
        final String s = WHITESPACE
            .matcher(pmc)
            .replaceAll("")
            .toUpperCase(java.util.Locale.ROOT);

        return PMC.matcher(s).matches() ? s : "";
    }

}
|
|
@ -0,0 +1,16 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
// https://researchguides.stevens.edu/c.php?g=442331&p=6577176
|
||||
/**
 * Cleaning rule for PubMed identifiers (PMID).
 */
public class PmidCleaningRule {

    /** Matches any single whitespace character; compiled once instead of on every call. */
    private static final java.util.regex.Pattern WHITESPACE = java.util.regex.Pattern.compile("\\s");

    /**
     * Any number of leading zeros followed by the significant part (group 1): 1 to 8 digits,
     * the first of which is not zero. This is the same set of values obtained by stripping the
     * leading zeros first and then requiring 1 to 8 digits.
     */
    private static final java.util.regex.Pattern PMID = java.util.regex.Pattern.compile("0*([1-9]\\d{0,7})");

    /**
     * Normalises a PMID: the value is lower-cased, stripped of whitespace, trimmed, and
     * deprived of its leading zeros; it is accepted only when 1 to 8 digits remain.
     * A value made of zeros only is rejected.
     *
     * @param pmid the raw identifier, must not be {@code null}
     * @return the digits without leading zeros, or the empty string when the value is not a valid PMID
     * @throws NullPointerException if {@code pmid} is {@code null}
     */
    public static String clean(String pmid) {
        // Locale.ROOT keeps the lower-casing independent from the JVM default locale.
        final String s = WHITESPACE
            .matcher(pmid.toLowerCase(java.util.Locale.ROOT))
            .replaceAll("")
            // trim() also drops leading/trailing control characters that \s does not cover
            .trim();

        final java.util.regex.Matcher m = PMID.matcher(s);
        return m.matches() ? m.group(1) : "";
    }

}
|
|
@ -0,0 +1,18 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
// https://ror.readme.io/docs/ror-identifier-pattern
|
||||
/**
 * Cleaning rule for ROR identifiers.
 */
public class RorCleaningRule {

    /** Matches any single whitespace character; compiled once instead of on every call. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s");

    /**
     * Optional {@code ror.org/} host (itself optionally preceded by {@code http://} or
     * {@code https://}) followed by the identifier proper (captured in group 1): a leading
     * {@code 0}, six characters out of digits and the letters a-z except i, l, o, u, and two
     * trailing digits. The character class deliberately contains no {@code |}: inside a
     * character class it is a literal pipe, not an alternation.
     */
    private static final Pattern ROR = Pattern
        .compile("(?:(?:https?://)?ror\\.org/)?(0[a-hj-km-np-tv-z0-9]{6}[0-9]{2})");

    /**
     * Normalises a ROR identifier to the form {@code https://ror.org/<id>}.
     * Whitespace is removed and the value is lower-cased before validation. Both the bare
     * identifier and the forms already carrying the {@code ror.org/} host are accepted, so
     * applying the rule to its own output yields the same value.
     *
     * @param ror the raw identifier, must not be {@code null}
     * @return the normalised identifier, or the empty string when the value is not a valid ROR id
     * @throws NullPointerException if {@code ror} is {@code null}
     */
    public static String clean(String ror) {
        // Locale.ROOT keeps the lower-casing independent from the JVM default locale.
        final String s = WHITESPACE
            .matcher(ror)
            .replaceAll("")
            .toLowerCase(java.util.Locale.ROOT);

        final Matcher m = ROR.matcher(s);
        return m.matches() ? "https://ror.org/" + m.group(1) : "";
    }

}
|
Loading…
Reference in New Issue