[orcidenrichment] When comparing authors, handle hyphenation and punctuation characters and normalize Unicode strings
This commit is contained in:
parent
ce4036d6b2
commit
33eb0f60e6
|
@ -1,11 +1,12 @@
|
|||
package eu.dnetlib.dhp.utils
|
||||
|
||||
import java.text.Normalizer
|
||||
import java.util.Locale
|
||||
import java.util.regex.Pattern
|
||||
import scala.util.control.Breaks.{break, breakable}
|
||||
|
||||
object AuthorMatchers {
|
||||
val SPLIT_REGEX = Pattern.compile("[\\s,\\.]+")
|
||||
val SPLIT_REGEX = Pattern.compile("[\\s\\p{Punct}\\p{Pd}]+")
|
||||
|
||||
val WORD_DIFF = 2
|
||||
|
||||
|
@ -24,9 +25,16 @@ object AuthorMatchers {
|
|||
}
|
||||
}
|
||||
|
||||
/** Splits an author name into a sorted array of lower-cased tokens.
  *
  * The input is converted to Unicode NFC form and lower-cased with `Locale.ROOT`,
  * then split on `SPLIT_REGEX`; empty tokens are dropped and the rest are sorted.
  *
  * @param s the raw author name; `null` is treated as a name with no tokens
  * @return the sorted, non-empty, lower-cased tokens (an empty array for `null` input)
  */
def normalize(s: String): Array[String] =
  // Option(s) maps a null name to None, so a missing name yields no tokens instead of an NPE
  Option(s).fold(Array.empty[String]) { name =>
    // NFC gives precomposed and decomposed spellings of a character the same representation
    val canonical = Normalizer.normalize(name, Normalizer.Form.NFC).toLowerCase(Locale.ROOT)
    SPLIT_REGEX.split(canonical).filter(_.nonEmpty).sorted
  }
|
||||
|
||||
def matchOrderedTokenAndAbbreviations(a1: String, a2: String): Boolean = {
|
||||
val p1: Array[String] = SPLIT_REGEX.split(a1.trim.toLowerCase(Locale.ROOT)).filter(_.nonEmpty).sorted
|
||||
val p2: Array[String] = SPLIT_REGEX.split(a2.trim.toLowerCase(Locale.ROOT)).filter(_.nonEmpty).sorted
|
||||
val p1: Array[String] = normalize(a1)
|
||||
val p2: Array[String] = normalize(a2)
|
||||
|
||||
if (p1.length < 2 || p2.length < 2) return false
|
||||
if (Math.abs(p1.length - p2.length) > WORD_DIFF) return false // use alternative comparison algo
|
||||
|
|
Loading…
Reference in New Issue