forked from D-Net/dnet-hadoop
Added a method to normalize DOI values (lower-case the value, remove everything preceding the "10." prefix, and filter out DOIs that do not contain a "10." prefix)
This commit is contained in:
parent
6d3f960238
commit
50cc21d92e
|
@ -38,6 +38,9 @@ object DoiBoostMappingUtil {
|
||||||
val OPENAIRE_PREFIX = "openaire____"
|
val OPENAIRE_PREFIX = "openaire____"
|
||||||
val SEPARATOR = "::"
|
val SEPARATOR = "::"
|
||||||
|
|
||||||
|
// Matches the DOI directory indicator either at the very start of the string ("10.")
// or right after a slash ("/10."), e.g. the tail of a resolver URL.
// The dot is escaped in BOTH alternatives: the previous "\\/10." also matched "/10"
// followed by any character (such as the "/107" inside "10.1007/10704282").
val DOI_PREFIX_REGEX = "(^10\\.|\\/10\\.)"

// Literal prefix that every DOI starts with.
val DOI_PREFIX = "10."
|
||||||
|
|
||||||
val invalidName = List(",", "none none", "none, none", "none &na;", "(:null)", "test test test", "test test", "test", "&na; &na;")
|
val invalidName = List(",", "none none", "none, none", "none &na;", "(:null)", "test test test", "test test", "test", "&na; &na;")
|
||||||
|
|
||||||
def toActionSet(item:Oaf) :(String, String) = {
|
def toActionSet(item:Oaf) :(String, String) = {
|
||||||
|
@ -352,5 +355,26 @@ object DoiBoostMappingUtil {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Returns true when `x` is null or consists only of characters removed by `trim`. */
def isEmpty(x: String): Boolean = Option(x).forall(_.trim.isEmpty)
|
||||||
|
|
||||||
|
/**
  * Normalizes a DOI value.
  *
  * All whitespace is removed, the text is lower-cased, and everything that precedes the
  * first occurrence of the "10." prefix (resolver URLs, "doi:" labels, ...) is dropped.
  *
  * @param input the raw DOI value; may be null
  * @return the normalized DOI, always starting with "10.", or null when `input` is null,
  *         blank, or does not contain "10." at all
  */
def normalizeDoi(input: String): String = {
  if (input == null) {
    // Guard against null: calling replaceAll on a null input would throw a NullPointerException.
    null
  } else {
    val cleaned = input.replaceAll("(?:\\n|\\r|\\t|\\s)", "").toLowerCase

    // No replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX) rewrite is applied here: cutting at the
    // first "10." below already strips any leading text, while that rewrite removed the
    // slash of the first "/10..." match and so merged DOI prefix and suffix whenever
    // something preceded the DOI (e.g. "doi:10.1000/10.5555" became "10.100010.5555").
    val start = cleaned.indexOf(DOI_PREFIX)

    // An empty string or a value without "10." yields -1 and is rejected.
    if (start < 0) null else cleaned.substring(start)
  }
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue