Added method to normalize DOI values (lower case, remove all preceding 10., filtering out DOIs not starting with 10.)

This commit is contained in:
Miriam Baglioni 2021-06-29 18:35:28 +02:00
parent 6d3f960238
commit 50cc21d92e
1 changed files with 24 additions and 0 deletions

View File

@ -38,6 +38,9 @@ object DoiBoostMappingUtil {
val OPENAIRE_PREFIX = "openaire____"
val SEPARATOR = "::"
// Matches the DOI prefix "10." either at the very start of the string or
// immediately after a slash. Both dots are escaped so that only a literal
// "10." matches; an unescaped dot would also match e.g. "/100" and corrupt
// DOIs whose suffix begins with "10" when the match is replaced by DOI_PREFIX.
val DOI_PREFIX_REGEX = "(^10\\.|\\/10\\.)"
// Literal prefix used as the replacement for DOI_PREFIX_REGEX matches.
val DOI_PREFIX = "10."
// Names treated as invalid; NOTE(review): consumers are outside this view — confirm matching is done on lower-cased input.
val invalidName = List(",", "none none", "none, none", "none &na;", "(:null)", "test test test", "test test", "test", "&na; &na;")
def toActionSet(item:Oaf) :(String, String) = {
@ -352,5 +355,26 @@ object DoiBoostMappingUtil {
}
/** Returns true when `x` is null or is empty after `trim`. */
def isEmpty(x: String): Boolean = Option(x).forall(_.trim.isEmpty)
/**
 * Normalizes a DOI string.
 *
 * All whitespace is removed, the text is lower-cased, the first match of
 * `DOI_PREFIX_REGEX` is replaced with `DOI_PREFIX`, and everything before the
 * first occurrence of `DOI_PREFIX` is dropped.
 *
 * @param input the raw DOI value; may be null
 * @return the normalized DOI starting with `DOI_PREFIX`, or null when `input`
 *         is null, blank, or contains no `DOI_PREFIX`
 */
def normalizeDoi(input: String): String = {
  // A null input is reported the same way as any other invalid value instead
  // of failing with a NullPointerException.
  if (isEmpty(input)) null
  else {
    val cleaned = input
      .replaceAll("\\s", "")
      // Locale.ROOT keeps lower-casing independent of the JVM default locale
      // (e.g. a Turkish locale would map "I" to a dotless "ı").
      .toLowerCase(java.util.Locale.ROOT)
      .replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX)
    val start = cleaned.indexOf(DOI_PREFIX)
    // The substring begins at the prefix, so no further prefix check is needed.
    if (start < 0) null else cleaned.substring(start)
  }
}
}