[orcidenrichment] Do not match in case of ambiguity: two or more authors match and at least one of them has an affiliation string

This commit is contained in:
Giambattista Bloisi 2024-11-21 16:17:08 +01:00
parent b9875f0095
commit ce4036d6b2
1 changed files with 65 additions and 47 deletions

View File

@ -7,6 +7,7 @@ import eu.dnetlib.dhp.schema.sx.OafUtils
import java.util import java.util
import scala.beans.BeanProperty import scala.beans.BeanProperty
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer
import scala.util.control.Breaks.{break, breakable} import scala.util.control.Breaks.{break, breakable}
case class OrcidAuthor( case class OrcidAuthor(
@ -20,9 +21,9 @@ case class OrcidAuthor(
} }
case class MatchedAuthors( case class MatchedAuthors(
@BeanProperty var author: Author, @BeanProperty var author: Author,
@BeanProperty var orcid: OrcidAuthor, @BeanProperty var orcid: OrcidAuthor,
@BeanProperty var `type`: String @BeanProperty var `type`: String
) )
case class MatchData( case class MatchData(
@ -47,13 +48,14 @@ object ORCIDAuthorEnricher extends Serializable {
id: String, id: String,
graph_authors: java.util.List[Author], graph_authors: java.util.List[Author],
orcid_authors: java.util.List[OrcidAuthor], orcid_authors: java.util.List[OrcidAuthor],
classid:String, classid: String,
provenance:String provenance: String
): ORCIDAuthorEnricherResult = { ): ORCIDAuthorEnricherResult = {
// Author enriching strategy: // Author enriching strategy:
// 1) create a copy of graph author list in unmatched_authors // 1) create a copy of graph author list in unmatched_authors
// 2) find best match in unmatched_authors, remove it from unmatched_authors and enrich it so // 2) find best match in unmatched_authors, remove it from unmatched_authors and enrich it so
// that the enrichment is reflected in graph_authors (they share author instances) // that the enrichment is reflected in graph_authors (they share author instances).
// Do not match in case of ambiguity, i.e. when two or more authors match and at least one of them has an affiliation string
// 3) repeat (2) till the end of the list and then with different matching algorithms that have decreasing // 3) repeat (2) till the end of the list and then with different matching algorithms that have decreasing
// trust in their output // trust in their output
// At the end unmatched_authors will contain authors not matched with any of the matching algos // At the end unmatched_authors will contain authors not matched with any of the matching algos
@ -87,7 +89,19 @@ object ORCIDAuthorEnricher extends Serializable {
(author, orcid) => (author, orcid) =>
AuthorMatchers AuthorMatchers
.matchOrderedTokenAndAbbreviations(author.getFullname, orcid.givenName + " " + orcid.familyName), .matchOrderedTokenAndAbbreviations(author.getFullname, orcid.givenName + " " + orcid.familyName),
"orderedTokens", "orderedTokens-1",
classid,
provenance,
skipAmbiguities = true
) ++
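// second pass with the same ordered-token/abbreviation matching as above, but without skipAmbiguities: when several authors match, the first candidate is taken unless any candidate has an affiliation string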
extractAndEnrichMatches(
unmatched_authors,
orcid_authors,
(author, orcid) =>
AuthorMatchers
.matchOrderedTokenAndAbbreviations(author.getFullname, orcid.givenName + " " + orcid.familyName),
"orderedTokens-2",
classid, classid,
provenance provenance
) ++ ) ++
@ -116,50 +130,54 @@ object ORCIDAuthorEnricher extends Serializable {
} }
private def extractAndEnrichMatches( private def extractAndEnrichMatches(
graph_authors: java.util.List[Author], unmatched_authors: java.util.List[Author],
orcid_authors: java.util.List[OrcidAuthor], orcid_authors: java.util.List[OrcidAuthor],
matchingFunc: (Author, OrcidAuthor) => Boolean, matchingFunc: (Author, OrcidAuthor) => Boolean,
matchName: String, matchName: String,
classid:String, classid: String,
provenance : String provenance: String,
) = { skipAmbiguities: Boolean = false
val matched = scala.collection.mutable.ArrayBuffer.empty[MatchedAuthors] ): ArrayBuffer[MatchedAuthors] = {
val matched = ArrayBuffer.empty[MatchedAuthors]
if (graph_authors != null && !graph_authors.isEmpty) { if (unmatched_authors == null || unmatched_authors.isEmpty) {
val ait = graph_authors.iterator return matched
}
while (ait.hasNext) { val oit = orcid_authors.iterator
val author = ait.next() while (oit.hasNext) {
val oit = orcid_authors.iterator val orcid = oit.next()
val candidates = unmatched_authors.asScala.foldLeft(ArrayBuffer[Author]())((res, author) => {
breakable { if (matchingFunc(author, orcid)) {
while (oit.hasNext) { res += author
val orcid = oit.next()
if (matchingFunc(author, orcid)) {
ait.remove()
oit.remove()
matched += MatchedAuthors(author, orcid, matchName)
if (author.getPid == null) {
author.setPid(new util.ArrayList[StructuredProperty]())
}
val orcidPID = OafUtils.createSP(orcid.orcid, classid, classid)
//val orcidPID = OafUtils.createSP(orcid.orcid, ModelConstants.ORCID, ModelConstants.ORCID)
orcidPID.setDataInfo(OafUtils.generateDataInfo())
orcidPID.getDataInfo.setProvenanceaction(
//OafUtils.createQualifier("ORCID_ENRICHMENT", "ORCID_ENRICHMENT")
OafUtils.createQualifier(provenance, provenance)
)
author.getPid.add(orcidPID)
break()
}
}
} }
res
})
if (
candidates.size == 1 ||
(candidates.size > 1 && !skipAmbiguities && !candidates
.exists(a => a.getRawAffiliationString != null && !a.getRawAffiliationString.isEmpty))
) {
val author = candidates(0)
unmatched_authors.remove(author)
oit.remove()
matched += MatchedAuthors(author, orcid, matchName)
if (author.getPid == null) {
author.setPid(new util.ArrayList[StructuredProperty]())
}
val orcidPID = OafUtils.createSP(orcid.orcid, classid, classid)
orcidPID.setDataInfo(OafUtils.generateDataInfo())
orcidPID.getDataInfo.setProvenanceaction(
OafUtils.createQualifier(provenance, provenance)
)
author.getPid.add(orcidPID)
} }
} }
matched matched