Enrich authors with ORCID info using new matching algorithm
parent
3cd5590f3b
commit
9092075760
@ -1,40 +0,0 @@
|
||||
package eu.dnetlib.dhp.enrich.orcid
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import eu.dnetlib.dhp.schema.oaf.{Author, Publication}
|
||||
import eu.dnetlib.dhp.schema.sx.OafUtils
|
||||
import org.apache.spark.sql.Row
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
object AuthorEnricher extends Serializable {
|
||||
|
||||
def createAuthor(givenName: String, familyName: String, orcid: String): Author = {
|
||||
val a = new Author
|
||||
a.setName(givenName)
|
||||
a.setSurname(familyName)
|
||||
a.setFullname(s"$givenName $familyName")
|
||||
val pid = OafUtils.createSP(orcid, ModelConstants.ORCID, ModelConstants.ORCID)
|
||||
pid.setDataInfo(OafUtils.generateDataInfo())
|
||||
pid.getDataInfo.setProvenanceaction(OafUtils.createQualifier("ORCID_ENRICHMENT", "ORCID_ENRICHMENT"))
|
||||
a.setPid(List(pid).asJava)
|
||||
a
|
||||
}
|
||||
|
||||
def toOAFAuthor(r: Row): java.util.List[Author] = {
|
||||
r.getList[Row](1)
|
||||
.asScala
|
||||
.map(s => createAuthor(s.getAs[String]("givenName"), s.getAs[String]("familyName"), s.getAs[String]("orcid")))
|
||||
.toList
|
||||
.asJava
|
||||
}
|
||||
|
||||
// def enrichAuthor(p:Publication,r:Row): Unit = {
|
||||
// val k:Map[String, OAuthor] =r.getList[Row](1).asScala.map(s => (s.getAs[String]("orcid"), OAuthor(s.getAs[String]("givenName") ,s.getAs[String]("familyName") ))).groupBy(_._1).mapValues(_.map(_._2).head)
|
||||
// println(k)
|
||||
//
|
||||
//
|
||||
//
|
||||
// }
|
||||
|
||||
}
|
@ -0,0 +1,128 @@
|
||||
package eu.dnetlib.dhp.enrich.orcid
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import eu.dnetlib.dhp.schema.oaf.{Author, StructuredProperty}
|
||||
import eu.dnetlib.dhp.schema.sx.OafUtils
|
||||
|
||||
import java.util
|
||||
import scala.beans.BeanProperty
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.util.control.Breaks.{break, breakable}
|
||||
|
||||
case class ORCIDAuthorEnricherResult(
|
||||
@BeanProperty var id: String,
|
||||
@BeanProperty var enriched_author: java.util.List[Author],
|
||||
@BeanProperty var author_matched: java.util.List[MatchedAuthors],
|
||||
@BeanProperty var author_unmatched: java.util.List[Author],
|
||||
@BeanProperty var orcid_unmatched: java.util.List[OrcidAutor]
|
||||
)
|
||||
|
||||
object ORCIDAuthorEnricher extends Serializable {
|
||||
|
||||
def enrichOrcid(
|
||||
id: String,
|
||||
graph_authors: java.util.List[Author],
|
||||
orcid_authors: java.util.List[OrcidAutor]
|
||||
): ORCIDAuthorEnricherResult = {
|
||||
// Author enriching strategy:
|
||||
// 1) create a copy of graph author list in unmatched_authors
|
||||
// 2) find best match in unmatched_authors, remove it from unmatched_authors and enrich it so
|
||||
// that the enrichment is reflected in graph_authors (they share author instances)
|
||||
// 3) repeat (2) till the end of the list and then with different matching algorithms that have decreasing
|
||||
// trust in their output
|
||||
// At the end unmatched_authors will contain authors not matched with any of the matching algos
|
||||
val unmatched_authors = new util.ArrayList[Author](graph_authors)
|
||||
|
||||
val matches = {
|
||||
// Look after exact fullname match, reconstruct ORCID fullname as givenName + familyName
|
||||
extractAndEnrichMatches(
|
||||
unmatched_authors,
|
||||
orcid_authors,
|
||||
(author, orcid) =>
|
||||
ORCIDAuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.givenName + " " + orcid.familyName),
|
||||
"fullName"
|
||||
) ++
|
||||
// Look after exact reversed fullname match, reconstruct ORCID fullname as familyName + givenName
|
||||
extractAndEnrichMatches(
|
||||
unmatched_authors,
|
||||
orcid_authors,
|
||||
(author, orcid) =>
|
||||
ORCIDAuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.familyName + " " + orcid.givenName),
|
||||
"reversedFullName"
|
||||
) ++
|
||||
// split author names in tokens, order the tokens, then check for matches of full tokens or abbreviations
|
||||
extractAndEnrichMatches(
|
||||
unmatched_authors,
|
||||
orcid_authors,
|
||||
(author, orcid) =>
|
||||
ORCIDAuthorMatchers
|
||||
.matchOrderedTokenAndAbbreviations(author.getFullname, orcid.givenName + " " + orcid.familyName),
|
||||
"orderedTokens"
|
||||
) ++
|
||||
// look after exact matches of ORCID creditName
|
||||
extractAndEnrichMatches(
|
||||
unmatched_authors,
|
||||
orcid_authors,
|
||||
(author, orcid) => ORCIDAuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.creditName),
|
||||
"creditName"
|
||||
) ++
|
||||
// look after exact matches in ORCID otherNames
|
||||
extractAndEnrichMatches(
|
||||
unmatched_authors,
|
||||
orcid_authors,
|
||||
(author, orcid) =>
|
||||
orcid.otherNames != null && ORCIDAuthorMatchers.matchOtherNames(author.getFullname, orcid.otherNames.asScala),
|
||||
"otherNames"
|
||||
)
|
||||
}
|
||||
|
||||
ORCIDAuthorEnricherResult(id, graph_authors, matches.asJava, unmatched_authors, orcid_authors)
|
||||
}
|
||||
|
||||
private def extractAndEnrichMatches(
|
||||
graph_authors: java.util.List[Author],
|
||||
orcid_authors: java.util.List[OrcidAutor],
|
||||
matchingFunc: (Author, OrcidAutor) => Boolean,
|
||||
matchName: String
|
||||
) = {
|
||||
val matched = scala.collection.mutable.ArrayBuffer.empty[MatchedAuthors]
|
||||
|
||||
if (graph_authors != null && graph_authors.isEmpty) {
|
||||
val ait = graph_authors.iterator
|
||||
|
||||
while (ait.hasNext) {
|
||||
val author = ait.next()
|
||||
val oit = orcid_authors.iterator
|
||||
|
||||
breakable {
|
||||
while (oit.hasNext) {
|
||||
val orcid = oit.next()
|
||||
|
||||
if (matchingFunc(author, orcid)) {
|
||||
ait.remove()
|
||||
oit.remove()
|
||||
matched += MatchedAuthors(author, orcid, matchName)
|
||||
|
||||
if (author.getPid == null) {
|
||||
author.setPid(new util.ArrayList[StructuredProperty]())
|
||||
}
|
||||
|
||||
val orcidPID = OafUtils.createSP(orcid.orcid, ModelConstants.ORCID, ModelConstants.ORCID)
|
||||
orcidPID.setDataInfo(OafUtils.generateDataInfo())
|
||||
orcidPID.getDataInfo.setProvenanceaction(
|
||||
OafUtils.createQualifier("ORCID_ENRICHMENT", "ORCID_ENRICHMENT")
|
||||
)
|
||||
|
||||
author.getPid.add(orcidPID)
|
||||
|
||||
break()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
matched
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,65 @@
|
||||
package eu.dnetlib.dhp.enrich.orcid
|
||||
|
||||
import java.util.Locale
|
||||
import java.util.regex.Pattern
|
||||
|
||||
object ORCIDAuthorMatchers {
|
||||
val SPLIT_REGEX = Pattern.compile("[\\s,\\.]+")
|
||||
|
||||
val WORD_DIFF = 2
|
||||
|
||||
def matchEqualsIgnoreCase(a1: String, a2: String): Boolean = {
|
||||
if (a1 == null || a2 == null)
|
||||
false
|
||||
else
|
||||
a1 == a2 || a1.toLowerCase(Locale.ROOT).equals(a2.toLowerCase(Locale.ROOT))
|
||||
}
|
||||
|
||||
def matchOtherNames(fullName: String, otherNames: Seq[String]): Boolean = {
|
||||
if (otherNames != null) {
|
||||
otherNames.exists(matchEqualsIgnoreCase(fullName, _))
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
def matchOrderedTokenAndAbbreviations(a1: String, a2: String): Boolean = {
|
||||
val p1: Array[String] = SPLIT_REGEX.split(a1.trim.toLowerCase(Locale.ROOT)).filter(_.nonEmpty).sorted
|
||||
val p2: Array[String] = SPLIT_REGEX.split(a2.trim.toLowerCase(Locale.ROOT)).filter(_.nonEmpty).sorted
|
||||
|
||||
if (p1.length < 2 || p2.length < 2) return false
|
||||
if (Math.abs(p1.length - p2.length) > WORD_DIFF) return false // use alternative comparison algo
|
||||
|
||||
var p1Idx: Int = 0
|
||||
var p2Idx: Int = 0
|
||||
var shortMatches: Int = 0
|
||||
var longMatches: Int = 0
|
||||
while (p1Idx < p1.length && p2Idx < p2.length) {
|
||||
val e1: String = p1(p1Idx)
|
||||
val c1: Char = e1.charAt(0)
|
||||
val e2: String = p2(p2Idx)
|
||||
val c2: Char = e2.charAt(0)
|
||||
if (c1 < c2) p1Idx += 1
|
||||
else if (c1 > c2) p2Idx += 1
|
||||
else {
|
||||
var res: Boolean = false
|
||||
if (e1.length != 1 && e2.length != 1) {
|
||||
res = e1 == e2
|
||||
longMatches += 1
|
||||
} else {
|
||||
res = true
|
||||
shortMatches += 1
|
||||
}
|
||||
if (res) {
|
||||
p1Idx += 1
|
||||
p2Idx += 1
|
||||
} else {
|
||||
val diff: Int = e1.compareTo(e2)
|
||||
if (diff < 0) p1Idx += 1
|
||||
else if (diff > 0) p2Idx += 1
|
||||
}
|
||||
}
|
||||
}
|
||||
longMatches > 0 && (shortMatches + longMatches) == Math.min(p1.length, p2.length)
|
||||
}
|
||||
}
|
@ -0,0 +1,35 @@
|
||||
package eu.dnetlib.dhp.enrich.orcid
|
||||
|
||||
import eu.dnetlib.dhp.enrich.orcid.ORCIDAuthorMatchers.matchOrderedTokenAndAbbreviations
|
||||
import org.junit.jupiter.api.Assertions.{assertFalse, assertTrue}
|
||||
import org.junit.jupiter.api.Test
|
||||
|
||||
class ORCIDAuthorMatchersTest {
|
||||
|
||||
@Test def testShortNames(): Unit = {
|
||||
assertTrue(matchOrderedTokenAndAbbreviations("Lasagni Mariozzi Federico", "Lasagni F. Mariozzi"))
|
||||
}
|
||||
|
||||
@Test def testInvertedNames(): Unit = {
|
||||
assertTrue(matchOrderedTokenAndAbbreviations("Andrea, Paolo Marcello", "Marcello Paolo, Andrea"))
|
||||
}
|
||||
|
||||
@Test def testHomonymy(): Unit = {
|
||||
assertTrue(matchOrderedTokenAndAbbreviations("Jang Myung Lee", "J Lee"))
|
||||
}
|
||||
|
||||
@Test def testAmbiguousShortNames(): Unit = {
|
||||
assertFalse(matchOrderedTokenAndAbbreviations("P. Mariozzi", "M. Paolozzi"))
|
||||
}
|
||||
|
||||
@Test def testNonMatches(): Unit = {
|
||||
assertFalse(matchOrderedTokenAndAbbreviations("Giovanni Paolozzi", "Francesco Paolozzi"))
|
||||
assertFalse(matchOrderedTokenAndAbbreviations("G. Paolozzi", "F. Paolozzi"))
|
||||
}
|
||||
|
||||
@Test def testChineseNames(): Unit = {
|
||||
assertTrue(matchOrderedTokenAndAbbreviations("孙林 Sun Lin", "Sun Lin"))
|
||||
// assertTrue(AuthorsMatchRevised.compare("孙林 Sun Lin", "孙林")); // not yet implemented
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue