// Provenance: additions made by commit 50cc21d92e09805934c5b73060bd10803b35a77e
// (miriam.baglioni, Tue, 29 Jun 2021 18:35:28 +0200) to
// dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala
// Subject: "Added method to normalize doi values (lower case, remove all
// preceding 10., filtering out doi not starting with 10.)"
//
// Everything below is a member of `object DoiBoostMappingUtil`. The patch context
// also showed the opening line of `toActionSet(item: Oaf)`; it is omitted here
// because its body is not part of the patch.

// Pre-existing constants shown as unchanged context in the first hunk.
val OPENAIRE_PREFIX = "openaire____"
val SEPARATOR = "::"

/** Regex matching a literal "10." at the start of the string or right after a '/'.
  * Both dots are escaped so that only a real "10." prefix is matched.
  */
val DOI_PREFIX_REGEX = "(^10\\.|\\/10\\.)"

/** Literal prefix every normalized DOI returned by [[normalizeDoi]] starts with. */
val DOI_PREFIX = "10."

val invalidName = List(",", "none none", "none, none", "none &na;", "(:null)", "test test test", "test test", "test", "&na; &na;")

/** Returns true when `x` is null or contains nothing but whitespace. */
def isEmpty(x: String): Boolean = x == null || x.trim.isEmpty

/** Normalizes a DOI value.
  *
  * All whitespace is removed, the value is lower-cased, and everything preceding
  * the first "10." is discarded.
  *
  * @param input raw DOI value; may be null
  * @return the normalized DOI starting with "10.", or null when `input` is null
  *         or contains no "10." (null is kept as the "invalid" marker so existing
  *         callers that test for null keep working)
  */
def normalizeDoi(input: String): String = {
  if (input == null) null
  else {
    val cleaned = input
      .replaceAll("\\s+", "")
      // Locale.ROOT keeps lower-casing independent of the JVM default locale
      // (e.g. a Turkish locale would map 'I' to a dotless 'ı').
      .toLowerCase(java.util.Locale.ROOT)
      .replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX)

    // An empty or prefix-less value yields -1 here and is rejected.
    val start = cleaned.indexOf(DOI_PREFIX)
    if (start < 0) null else cleaned.substring(start)
  }
}