diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java index 798eae4ddd..859fc074c3 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java @@ -16,7 +16,10 @@ import eu.dnetlib.dhp.schema.oaf.*; public class CleaningFunctions { public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/"; + public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})"; + public static final int ORCID_LEN = 19; + public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)"; public static final Set PID_BLACKLIST = new HashSet<>(); @@ -86,7 +89,7 @@ public class CleaningFunctions { } else if (value instanceof Organization) { Organization o = (Organization) value; if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) { - o.setCountry(qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_COUNTRY_TYPE)); + o.setCountry(qualifier(ModelConstants.UNKNOWN, "Unknown", ModelConstants.DNET_COUNTRY_TYPE)); } } else if (value instanceof Relation) { // nothing to clean here @@ -153,12 +156,14 @@ public class CleaningFunctions { if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) { r .setResourcetype( - qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE)); + qualifier(ModelConstants.UNKNOWN, "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE)); } if (Objects.nonNull(r.getInstance())) { for (Instance i : r.getInstance()) { if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) { - i.setAccessright(qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES)); + i + .setAccessright( + qualifier(ModelConstants.UNKNOWN, "not available", ModelConstants.DNET_ACCESS_MODES)); } if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) { i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY); @@ -173,7 +178,7 @@ public class CleaningFunctions { if (Objects.isNull(bestaccessrights)) { r .setBestaccessright( - qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES)); + qualifier(ModelConstants.UNKNOWN, "not available", ModelConstants.DNET_ACCESS_MODES)); } else { r.setBestaccessright(bestaccessrights); } @@ -227,7 +232,7 @@ public class CleaningFunctions { .trim() .toLowerCase() .replaceAll(ORCID_CLEANING_REGEX, "$1-$2-$3-$4"); - if (orcid.length() == 19) { + if (orcid.length() == ORCID_LEN) { p.setValue(orcid); } else { p.setValue("");