2024-04-15 18:19:29 +02:00
|
|
|
package eu.dnetlib.pace.util
|
2024-02-04 10:07:15 +01:00
|
|
|
|
|
|
|
import java.util.Locale
|
|
|
|
import java.util.regex.Pattern
|
2024-04-15 18:19:29 +02:00
|
|
|
import scala.util.control.Breaks.{break, breakable}
|
2024-02-04 10:07:15 +01:00
|
|
|
|
2024-04-15 18:19:29 +02:00
|
|
|
/** String-based heuristics for deciding whether two author names denote the same person,
  * plus a helper that pairs up and removes matching names from two Java lists.
  *
  * Every matcher treats a `null` name as "no match" instead of throwing.
  */
object AuthorMatchers {

  /** Token separator: any run of whitespace, commas or dots. */
  val SPLIT_REGEX: Pattern = Pattern.compile("[\\s,\\.]+")

  /** Maximum allowed difference in token count for
    * [[matchOrderedTokenAndAbbreviations]] to attempt a comparison.
    */
  val WORD_DIFF: Int = 2

  /** Case-insensitive equality (lower-casing with `Locale.ROOT`).
    *
    * @param a1 first name, may be `null`
    * @param a2 second name, may be `null`
    * @return `true` when both names are non-null and equal ignoring case; `false` otherwise
    */
  def matchEqualsIgnoreCase(a1: String, a2: String): Boolean =
    a1 != null && a2 != null &&
      (a1 == a2 || a1.toLowerCase(Locale.ROOT) == a2.toLowerCase(Locale.ROOT))

  /** Checks whether `fullName` equals, ignoring case, any of the alternative names.
    *
    * @param fullName   the name to look for, may be `null`
    * @param otherNames candidate names, may be `null` or contain `null` entries
    * @return `true` when at least one candidate matches according to [[matchEqualsIgnoreCase]]
    */
  def matchOtherNames(fullName: String, otherNames: Seq[String]): Boolean =
    otherNames != null && otherNames.exists(matchEqualsIgnoreCase(fullName, _))

  /** Order-insensitive comparison of two names that tolerates abbreviated tokens.
    *
    * Both names are trimmed, lower-cased, split on [[SPLIT_REGEX]] and their tokens sorted.
    * A single-character token matches any token starting with the same character
    * (a "short" match); two longer tokens match only when they are equal (a "long" match).
    *
    * @param a1 first name, may be `null`
    * @param a2 second name, may be `null`
    * @return `true` when both names have at least two tokens, their token counts differ by at
    *         most [[WORD_DIFF]], at least one long match exists and the total number of
    *         matches equals the token count of the shorter name; `false` otherwise
    *         (including when either name is `null`)
    */
  def matchOrderedTokenAndAbbreviations(a1: String, a2: String): Boolean =
    if (a1 == null || a2 == null) false
    else {
      val p1 = sortedTokens(a1)
      val p2 = sortedTokens(a2)
      // Names with a single token, or with very different token counts, are out of scope
      // for this algorithm; callers are expected to use an alternative comparison.
      val comparable =
        p1.length >= 2 && p2.length >= 2 && Math.abs(p1.length - p2.length) <= WORD_DIFF
      comparable && tokensMatch(p1, p2)
    }

  /** Splits a non-null name into its non-empty, lower-cased tokens, sorted lexicographically. */
  private def sortedTokens(name: String): Array[String] =
    SPLIT_REGEX.split(name.trim.toLowerCase(Locale.ROOT)).filter(_.nonEmpty).sorted

  /** Walks two sorted token arrays in parallel (merge style) counting short and long matches. */
  private def tokensMatch(p1: Array[String], p2: Array[String]): Boolean = {
    var p1Idx = 0
    var p2Idx = 0
    var shortMatches = 0
    var longMatches = 0

    while (p1Idx < p1.length && p2Idx < p2.length) {
      val e1 = p1(p1Idx)
      val e2 = p2(p2Idx)
      val c1 = e1.charAt(0)
      val c2 = e2.charAt(0)

      if (c1 < c2) p1Idx += 1
      else if (c1 > c2) p2Idx += 1
      else if (e1.length == 1 || e2.length == 1) {
        // Same initial and at least one side is an abbreviation: accept as a short match.
        shortMatches += 1
        p1Idx += 1
        p2Idx += 1
      } else if (e1 == e2) {
        longMatches += 1
        p1Idx += 1
        p2Idx += 1
      } else if (e1.compareTo(e2) < 0) {
        // Same initial but different full tokens: advance whichever side sorts first.
        p1Idx += 1
      } else {
        p2Idx += 1
      }
    }

    longMatches > 0 && (shortMatches + longMatches) == Math.min(p1.length, p2.length)
  }

  /** Java-friendly overload of `removeMatches` accepting a `BiFunction`.
    *
    * @param graph_authors first list of names; matched entries are removed from it
    * @param orcid_authors second list of names; matched entries are removed from it
    * @param matchingFunc  predicate deciding whether a pair of names matches
    * @return the matched pairs, flattened (see the `Function2` overload)
    */
  def removeMatches(
    graph_authors: java.util.List[String],
    orcid_authors: java.util.List[String],
    matchingFunc: java.util.function.BiFunction[String, String, Boolean]
  ): java.util.List[String] = {
    // An explicitly typed Function2 value selects the Scala overload unambiguously,
    // so this call can never resolve back to this method.
    val scalaFunc: (String, String) => Boolean = (a, b) => matchingFunc.apply(a, b)
    removeMatches(graph_authors, orcid_authors, scalaFunc)
  }

  /** Pairs each name of `graph_authors` with the first name of `orcid_authors` it matches,
    * removing both names from their lists.
    *
    * Both input lists are mutated through their iterators, so they must support
    * `Iterator.remove` and should be distinct list instances.
    *
    * @param graph_authors first list of names, may be `null` or empty
    * @param orcid_authors second list of names, may be `null` or empty
    * @param matchingFunc  predicate called as `matchingFunc(graphAuthor, orcidAuthor)`
    * @return a new list holding the matched names as consecutive pairs
    *         (`graph name`, `orcid name`, `graph name`, `orcid name`, ...);
    *         empty when either input list is `null` or empty
    */
  def removeMatches(
    graph_authors: java.util.List[String],
    orcid_authors: java.util.List[String],
    matchingFunc: (String, String) => Boolean
  ): java.util.List[String] = {
    val matched = new java.util.ArrayList[String]()

    val hasGraphAuthors = graph_authors != null && !graph_authors.isEmpty
    val hasOrcidAuthors = orcid_authors != null && !orcid_authors.isEmpty

    if (hasGraphAuthors && hasOrcidAuthors) {
      val ait = graph_authors.iterator

      // Stop as soon as there is nothing left to match against.
      while (ait.hasNext && !orcid_authors.isEmpty) {
        val author = ait.next()
        val oit = orcid_authors.iterator

        breakable {
          while (oit.hasNext) {
            val orcid = oit.next()

            if (matchingFunc(author, orcid)) {
              ait.remove()
              oit.remove()

              matched.add(author)
              matched.add(orcid)

              // Each name is paired at most once: move on to the next graph author.
              break()
            }
          }
        }
      }
    }

    matched
  }

}
|