[orcidenrichment] Do not match in case of ambiguity: two or more authors match the same ORCID author and at least one of them has an affiliation string

2024-11-21 16:17:08 +01:00 · 2024-11-21 16:17:08 +01:00 · ce4036d6b2
parent b9875f0095
commit ce4036d6b2
1 changed files with 65 additions and 47 deletions
--- a/dhp-common/src/main/scala/eu/dnetlib/dhp/utils/ORCIDAuthorEnricher.scala
+++ b/dhp-common/src/main/scala/eu/dnetlib/dhp/utils/ORCIDAuthorEnricher.scala
@ -7,6 +7,7 @@ import eu.dnetlib.dhp.schema.sx.OafUtils
 import java.util
 import scala.beans.BeanProperty
 import scala.collection.JavaConverters._
+import scala.collection.mutable.ArrayBuffer
 import scala.util.control.Breaks.{break, breakable}

 case class OrcidAuthor(
@ -53,7 +54,8 @@ object ORCIDAuthorEnricher extends Serializable {
    // Author enriching strategy:
    // 1) create a copy of graph author list in unmatched_authors
    // 2) find best match in unmatched_authors, remove it from unmatched_authors and enrich it so
-    //     that the enrichment is reflected in  graph_authors (they share author instances)
+    //     that the enrichment is reflected in  graph_authors (they share author instances).
+    //     Do not match in case of ambiguity: two or more authors match the same ORCID author and at least one of them has an affiliation string
    // 3) repeat (2) till the end of the list and then with different matching algorithms that have decreasing
    //    trust in their output
    // At the end unmatched_authors will contain authors not matched with any of the matching algos
@ -87,7 +89,19 @@ object ORCIDAuthorEnricher extends Serializable {
        (author, orcid) =>
          AuthorMatchers
            .matchOrderedTokenAndAbbreviations(author.getFullname, orcid.givenName + " " + orcid.familyName),
-        "orderedTokens",
+        "orderedTokens-1",
+        classid,
+        provenance,
+        skipAmbiguities = true
+      ) ++
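+      // second pass, same matcher (split author names in tokens, order the tokens, then check for matches of full tokens or abbreviations): ambiguities are accepted, taking the first candidate, only when no candidate has an affiliation string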
+      extractAndEnrichMatches(
+        unmatched_authors,
+        orcid_authors,
+        (author, orcid) =>
+          AuthorMatchers
+            .matchOrderedTokenAndAbbreviations(author.getFullname, orcid.givenName + " " + orcid.familyName),
+        "orderedTokens-2",
        classid,
        provenance
      ) ++
@ -116,28 +130,38 @@ object ORCIDAuthorEnricher extends Serializable {
  }

  private def extractAndEnrichMatches(
-                                       graph_authors: java.util.List[Author],
+    unmatched_authors: java.util.List[Author],
    orcid_authors: java.util.List[OrcidAuthor],
    matchingFunc: (Author, OrcidAuthor) => Boolean,
    matchName: String,
    classid: String,
-                                       provenance : String
-  ) = {
-    val matched = scala.collection.mutable.ArrayBuffer.empty[MatchedAuthors]
+    provenance: String,
+    skipAmbiguities: Boolean = false
+  ): ArrayBuffer[MatchedAuthors] = {
+    val matched = ArrayBuffer.empty[MatchedAuthors]

-    if (graph_authors != null && !graph_authors.isEmpty) {
-      val ait = graph_authors.iterator
+    if (unmatched_authors == null || unmatched_authors.isEmpty) {
+      return matched
+    }

-      while (ait.hasNext) {
-        val author = ait.next()
    val oit = orcid_authors.iterator
-
-        breakable {
    while (oit.hasNext) {
      val orcid = oit.next()
-
+      val candidates = unmatched_authors.asScala.foldLeft(ArrayBuffer[Author]())((res, author) => {
        if (matchingFunc(author, orcid)) {
-              ait.remove()
+          res += author
+        }
+
+        res
+      })
+
+      if (
+        candidates.size == 1 ||
+        (candidates.size > 1 && !skipAmbiguities && !candidates
+          .exists(a => a.getRawAffiliationString != null && !a.getRawAffiliationString.isEmpty))
+      ) {
+        val author = candidates(0)
+        unmatched_authors.remove(author)
        oit.remove()
        matched += MatchedAuthors(author, orcid, matchName)

@ -146,20 +170,14 @@ object ORCIDAuthorEnricher extends Serializable {
        }

        val orcidPID = OafUtils.createSP(orcid.orcid, classid, classid)
-             //val orcidPID = OafUtils.createSP(orcid.orcid, ModelConstants.ORCID, ModelConstants.ORCID)
        orcidPID.setDataInfo(OafUtils.generateDataInfo())
        orcidPID.getDataInfo.setProvenanceaction(
-                //OafUtils.createQualifier("ORCID_ENRICHMENT", "ORCID_ENRICHMENT")
          OafUtils.createQualifier(provenance, provenance)
        )

        author.getPid.add(orcidPID)
+      }

-              break()
-            }
-          }
-        }
-      }
    }

    matched