[orcidenrichment] Do not match in case of ambiguity: two authors match and at least one of them has affiliation string
This commit is contained in:
parent
b9875f0095
commit
ce4036d6b2
|
@ -7,6 +7,7 @@ import eu.dnetlib.dhp.schema.sx.OafUtils
|
||||||
import java.util
|
import java.util
|
||||||
import scala.beans.BeanProperty
|
import scala.beans.BeanProperty
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
|
import scala.collection.mutable.ArrayBuffer
|
||||||
import scala.util.control.Breaks.{break, breakable}
|
import scala.util.control.Breaks.{break, breakable}
|
||||||
|
|
||||||
case class OrcidAuthor(
|
case class OrcidAuthor(
|
||||||
|
@ -20,9 +21,9 @@ case class OrcidAuthor(
|
||||||
}
|
}
|
||||||
|
|
||||||
case class MatchedAuthors(
|
case class MatchedAuthors(
|
||||||
@BeanProperty var author: Author,
|
@BeanProperty var author: Author,
|
||||||
@BeanProperty var orcid: OrcidAuthor,
|
@BeanProperty var orcid: OrcidAuthor,
|
||||||
@BeanProperty var `type`: String
|
@BeanProperty var `type`: String
|
||||||
)
|
)
|
||||||
|
|
||||||
case class MatchData(
|
case class MatchData(
|
||||||
|
@ -47,13 +48,14 @@ object ORCIDAuthorEnricher extends Serializable {
|
||||||
id: String,
|
id: String,
|
||||||
graph_authors: java.util.List[Author],
|
graph_authors: java.util.List[Author],
|
||||||
orcid_authors: java.util.List[OrcidAuthor],
|
orcid_authors: java.util.List[OrcidAuthor],
|
||||||
classid:String,
|
classid: String,
|
||||||
provenance:String
|
provenance: String
|
||||||
): ORCIDAuthorEnricherResult = {
|
): ORCIDAuthorEnricherResult = {
|
||||||
// Author enriching strategy:
|
// Author enriching strategy:
|
||||||
// 1) create a copy of graph author list in unmatched_authors
|
// 1) create a copy of graph author list in unmatched_authors
|
||||||
// 2) find best match in unmatched_authors, remove it from unmatched_authors and enrich it so
|
// 2) find best match in unmatched_authors, remove it from unmatched_authors and enrich it so
|
||||||
// that the enrichment is reflected in graph_authors (they share author instances)
|
// that the enrichment is reflected in graph_authors (they share author instances).
|
||||||
|
// Do not match in case of ambiguity: more than one author matches and at least one of them has a non-empty affiliation string
|
||||||
// 3) repeat (2) till the end of the list and then with different matching algorithms that have decreasing
|
// 3) repeat (2) till the end of the list and then with different matching algorithms that have decreasing
|
||||||
// trust in their output
|
// trust in their output
|
||||||
// At the end unmatched_authors will contain authors not matched with any of the matching algos
|
// At the end unmatched_authors will contain authors not matched with any of the matching algos
|
||||||
|
@ -87,7 +89,19 @@ object ORCIDAuthorEnricher extends Serializable {
|
||||||
(author, orcid) =>
|
(author, orcid) =>
|
||||||
AuthorMatchers
|
AuthorMatchers
|
||||||
.matchOrderedTokenAndAbbreviations(author.getFullname, orcid.givenName + " " + orcid.familyName),
|
.matchOrderedTokenAndAbbreviations(author.getFullname, orcid.givenName + " " + orcid.familyName),
|
||||||
"orderedTokens",
|
"orderedTokens-1",
|
||||||
|
classid,
|
||||||
|
provenance,
|
||||||
|
skipAmbiguities = true
|
||||||
|
) ++
|
||||||
|
// split author names in tokens, order the tokens, then check for matches of full tokens or abbreviations
|
||||||
|
extractAndEnrichMatches(
|
||||||
|
unmatched_authors,
|
||||||
|
orcid_authors,
|
||||||
|
(author, orcid) =>
|
||||||
|
AuthorMatchers
|
||||||
|
.matchOrderedTokenAndAbbreviations(author.getFullname, orcid.givenName + " " + orcid.familyName),
|
||||||
|
"orderedTokens-2",
|
||||||
classid,
|
classid,
|
||||||
provenance
|
provenance
|
||||||
) ++
|
) ++
|
||||||
|
@ -116,50 +130,54 @@ object ORCIDAuthorEnricher extends Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
private def extractAndEnrichMatches(
|
private def extractAndEnrichMatches(
|
||||||
graph_authors: java.util.List[Author],
|
unmatched_authors: java.util.List[Author],
|
||||||
orcid_authors: java.util.List[OrcidAuthor],
|
orcid_authors: java.util.List[OrcidAuthor],
|
||||||
matchingFunc: (Author, OrcidAuthor) => Boolean,
|
matchingFunc: (Author, OrcidAuthor) => Boolean,
|
||||||
matchName: String,
|
matchName: String,
|
||||||
classid:String,
|
classid: String,
|
||||||
provenance : String
|
provenance: String,
|
||||||
) = {
|
skipAmbiguities: Boolean = false
|
||||||
val matched = scala.collection.mutable.ArrayBuffer.empty[MatchedAuthors]
|
): ArrayBuffer[MatchedAuthors] = {
|
||||||
|
val matched = ArrayBuffer.empty[MatchedAuthors]
|
||||||
|
|
||||||
if (graph_authors != null && !graph_authors.isEmpty) {
|
if (unmatched_authors == null || unmatched_authors.isEmpty) {
|
||||||
val ait = graph_authors.iterator
|
return matched
|
||||||
|
}
|
||||||
|
|
||||||
while (ait.hasNext) {
|
val oit = orcid_authors.iterator
|
||||||
val author = ait.next()
|
while (oit.hasNext) {
|
||||||
val oit = orcid_authors.iterator
|
val orcid = oit.next()
|
||||||
|
val candidates = unmatched_authors.asScala.foldLeft(ArrayBuffer[Author]())((res, author) => {
|
||||||
breakable {
|
if (matchingFunc(author, orcid)) {
|
||||||
while (oit.hasNext) {
|
res += author
|
||||||
val orcid = oit.next()
|
|
||||||
|
|
||||||
if (matchingFunc(author, orcid)) {
|
|
||||||
ait.remove()
|
|
||||||
oit.remove()
|
|
||||||
matched += MatchedAuthors(author, orcid, matchName)
|
|
||||||
|
|
||||||
if (author.getPid == null) {
|
|
||||||
author.setPid(new util.ArrayList[StructuredProperty]())
|
|
||||||
}
|
|
||||||
|
|
||||||
val orcidPID = OafUtils.createSP(orcid.orcid, classid, classid)
|
|
||||||
//val orcidPID = OafUtils.createSP(orcid.orcid, ModelConstants.ORCID, ModelConstants.ORCID)
|
|
||||||
orcidPID.setDataInfo(OafUtils.generateDataInfo())
|
|
||||||
orcidPID.getDataInfo.setProvenanceaction(
|
|
||||||
//OafUtils.createQualifier("ORCID_ENRICHMENT", "ORCID_ENRICHMENT")
|
|
||||||
OafUtils.createQualifier(provenance, provenance)
|
|
||||||
)
|
|
||||||
|
|
||||||
author.getPid.add(orcidPID)
|
|
||||||
|
|
||||||
break()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
res
|
||||||
|
})
|
||||||
|
|
||||||
|
if (
|
||||||
|
candidates.size == 1 ||
|
||||||
|
(candidates.size > 1 && !skipAmbiguities && !candidates
|
||||||
|
.exists(a => a.getRawAffiliationString != null && !a.getRawAffiliationString.isEmpty))
|
||||||
|
) {
|
||||||
|
val author = candidates(0)
|
||||||
|
unmatched_authors.remove(author)
|
||||||
|
oit.remove()
|
||||||
|
matched += MatchedAuthors(author, orcid, matchName)
|
||||||
|
|
||||||
|
if (author.getPid == null) {
|
||||||
|
author.setPid(new util.ArrayList[StructuredProperty]())
|
||||||
|
}
|
||||||
|
|
||||||
|
val orcidPID = OafUtils.createSP(orcid.orcid, classid, classid)
|
||||||
|
orcidPID.setDataInfo(OafUtils.generateDataInfo())
|
||||||
|
orcidPID.getDataInfo.setProvenanceaction(
|
||||||
|
OafUtils.createQualifier(provenance, provenance)
|
||||||
|
)
|
||||||
|
|
||||||
|
author.getPid.add(orcidPID)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
matched
|
matched
|
||||||
|
|
Loading…
Reference in New Issue