129 lines
4.5 KiB
Scala
129 lines
4.5 KiB
Scala
package eu.dnetlib.dhp.enrich.orcid
|
|
|
|
import eu.dnetlib.dhp.schema.common.ModelConstants
|
|
import eu.dnetlib.dhp.schema.oaf.{Author, StructuredProperty}
|
|
import eu.dnetlib.dhp.schema.sx.OafUtils
|
|
|
|
import java.util
|
|
import scala.beans.BeanProperty
|
|
import scala.collection.JavaConverters._
|
|
import scala.util.control.Breaks.{break, breakable}
|
|
|
|
/** Outcome of [[ORCIDAuthorEnricher.enrichOrcid]] for a single record.
  *
  * Every field is a `var` annotated with `@BeanProperty`, so JavaBean-style getters and setters
  * are generated in addition to the Scala accessors.
  *
  * @param id               identifier handed to the enricher, reported back unchanged
  * @param enriched_author  the author list handed to the enricher; authors that were matched carry
  *                         the additional ORCID pid
  * @param author_matched   one entry per (author, ORCID author) pair that was matched
  * @param author_unmatched authors for which no matching strategy found an ORCID author
  * @param orcid_unmatched  ORCID authors left over once the matched ones have been removed
  */
case class ORCIDAuthorEnricherResult(
  @BeanProperty var id: String,
  @BeanProperty var enriched_author: util.List[Author],
  @BeanProperty var author_matched: util.List[MatchedAuthors],
  @BeanProperty var author_unmatched: util.List[Author],
  @BeanProperty var orcid_unmatched: util.List[OrcidAutor]
)
|
|
|
|
object ORCIDAuthorEnricher extends Serializable {

  /** Enriches the authors of a record with ORCID identifiers taken from a list of ORCID authors.
    *
    * Side effects: matched [[Author]] instances are modified in place (an ORCID pid is appended to
    * their pid list) and matched entries are removed from `orcid_authors` itself, so that list
    * must support `Iterator.remove` whenever a match can occur.
    *
    * A `null` `graph_authors` or `orcid_authors` is tolerated: no matching is attempted on a
    * missing list and the result simply reports no matches. The input lists are passed through to
    * the result as given (a `null` input stays `null`).
    *
    * @param id            identifier of the record, copied into the result
    * @param graph_authors authors of the record; the list itself is not structurally modified
    * @param orcid_authors candidate ORCID authors; matched candidates are removed from this list
    * @return the enrichment outcome: the original author list, the matches found, the authors that
    *         could not be matched and the ORCID authors that are left
    */
  def enrichOrcid(
    id: String,
    graph_authors: java.util.List[Author],
    orcid_authors: java.util.List[OrcidAutor]
  ): ORCIDAuthorEnricherResult = {
    // Author enriching strategy:
    // 1) create a copy of graph author list in unmatched_authors
    // 2) find best match in unmatched_authors, remove it from unmatched_authors and enrich it so
    //     that the enrichment is reflected in graph_authors (they share author instances)
    // 3) repeat (2) till the end of the list and then with different matching algorithms that have decreasing
    //    trust in their output
    // At the end unmatched_authors will contain authors not matched with any of the matching algos
    //
    // The copy constructor of ArrayList throws on null, hence the explicit guard.
    val unmatched_authors =
      if (graph_authors == null) new util.ArrayList[Author]()
      else new util.ArrayList[Author](graph_authors)

    // NOTE(review): if givenName or familyName can be null, the concatenations below produce the
    // literal text "null" as part of the reconstructed name - confirm whether that can happen and
    // whether the matchers need to guard against it.
    //
    // The strategies must run in this order: each call removes what it matched from both lists,
    // so later (less trusted) strategies only see what is still unmatched.
    val matches = {
      // Look after exact fullname match, reconstruct ORCID fullname as givenName + familyName
      extractAndEnrichMatches(
        unmatched_authors,
        orcid_authors,
        (author, orcid) =>
          ORCIDAuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.givenName + " " + orcid.familyName),
        "fullName"
      ) ++
      // Look after exact reversed fullname match, reconstruct ORCID fullname as familyName + givenName
      extractAndEnrichMatches(
        unmatched_authors,
        orcid_authors,
        (author, orcid) =>
          ORCIDAuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.familyName + " " + orcid.givenName),
        "reversedFullName"
      ) ++
      // split author names in tokens, order the tokens, then check for matches of full tokens or abbreviations
      extractAndEnrichMatches(
        unmatched_authors,
        orcid_authors,
        (author, orcid) =>
          ORCIDAuthorMatchers
            .matchOrderedTokenAndAbbreviations(author.getFullname, orcid.givenName + " " + orcid.familyName),
        "orderedTokens"
      ) ++
      // look after exact matches of ORCID creditName
      extractAndEnrichMatches(
        unmatched_authors,
        orcid_authors,
        (author, orcid) => ORCIDAuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.creditName),
        "creditName"
      ) ++
      // look after exact matches in ORCID otherNames
      extractAndEnrichMatches(
        unmatched_authors,
        orcid_authors,
        (author, orcid) =>
          orcid.otherNames != null && ORCIDAuthorMatchers.matchOtherNames(author.getFullname, orcid.otherNames.asScala),
        "otherNames"
      )
    }

    ORCIDAuthorEnricherResult(id, graph_authors, matches.asJava, unmatched_authors, orcid_authors)
  }

  /** Pairs each author with the first ORCID author accepted by `matchingFunc`.
    *
    * Every matched pair is removed from both input lists (through their iterators), the author is
    * enriched with the ORCID pid and the pair is recorded under `matchName`. Nothing is matched
    * when either list is `null` or when `graph_authors` is empty.
    *
    * @param graph_authors authors still waiting for a match; matched ones are removed
    * @param orcid_authors ORCID authors still available; matched ones are removed
    * @param matchingFunc  predicate deciding whether an author and an ORCID author are the same person
    * @param matchName     label of the matching strategy, stored in every produced match
    * @return the matches found by this strategy, in the order they were found
    */
  private def extractAndEnrichMatches(
    graph_authors: java.util.List[Author],
    orcid_authors: java.util.List[OrcidAutor],
    matchingFunc: (Author, OrcidAutor) => Boolean,
    matchName: String
  ): scala.collection.mutable.ArrayBuffer[MatchedAuthors] = {
    val matched = scala.collection.mutable.ArrayBuffer.empty[MatchedAuthors]

    if (graph_authors != null && orcid_authors != null && !graph_authors.isEmpty) {
      val ait = graph_authors.iterator

      while (ait.hasNext) {
        val author = ait.next()
        val oit = orcid_authors.iterator

        breakable {
          while (oit.hasNext) {
            val orcid = oit.next()

            if (matchingFunc(author, orcid)) {
              // Remove through the iterators so both traversals stay valid
              ait.remove()
              oit.remove()
              matched += MatchedAuthors(author, orcid, matchName)

              addOrcidPid(author, orcid)

              // An author is paired with at most one ORCID author: move on to the next author
              break()
            }
          }
        }
      }
    }

    matched
  }

  /** Appends to `author` a pid built from the ORCID of `orcid`, creating the pid list if missing. */
  private def addOrcidPid(author: Author, orcid: OrcidAutor): Unit = {
    if (author.getPid == null) {
      author.setPid(new util.ArrayList[StructuredProperty]())
    }

    val orcidPID = OafUtils.createSP(orcid.orcid, ModelConstants.ORCID, ModelConstants.ORCID)
    orcidPID.setDataInfo(OafUtils.generateDataInfo())
    // Record that this pid was added by the ORCID enrichment
    orcidPID.getDataInfo.setProvenanceaction(
      OafUtils.createQualifier("ORCID_ENRICHMENT", "ORCID_ENRICHMENT")
    )

    author.getPid.add(orcidPID)
  }

}
|