From 34d653de41e6264174c6bbecf26b1d356235c2ec Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 22 Jan 2021 14:16:33 +0100 Subject: [PATCH] [Cleaning] updated cleaning rule for DOIs --- .../java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java index 885d761c53..0cbf7219af 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java @@ -15,7 +15,7 @@ import eu.dnetlib.dhp.schema.oaf.*; public class CleaningFunctions { - public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/"; + public static final String DOI_PREFIX_REGEX = "^10\\."; public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})"; public static final int ORCID_LEN = 19; @@ -308,7 +308,7 @@ public class CleaningFunctions { // TODO add cleaning for more PID types as needed case "doi": - pid.setValue(value.toLowerCase().replaceAll(DOI_URL_PREFIX_REGEX, "")); + pid.setValue(value.toLowerCase().replaceAll(DOI_PREFIX_REGEX, "10.")); break; } return pid;