dnet-hadoop/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorEnricher.scala

package eu.dnetlib.dhp.enrich.orcid

import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.{Author, StructuredProperty}
import eu.dnetlib.dhp.schema.sx.OafUtils

import java.util
import scala.beans.BeanProperty
import scala.collection.JavaConverters._
import scala.util.control.Breaks.{break, breakable}

/** Outcome of [[ORCIDAuthorEnricher.enrichOrcid]] for a single record.
  *
  * Every field is a `@BeanProperty var`, so JavaBean-style getters and setters are generated in
  * addition to the usual case class accessors.
  * NOTE(review): presumably needed for bean-based (de)serialization of the result — confirm with callers.
  *
  * @param id               identifier handed to `enrichOrcid`, returned unchanged
  * @param enriched_author  the graph author list given to `enrichOrcid`; matched authors in it carry the added ORCID pid
  * @param author_matched   one entry per (graph author, ORCID author) pair that was matched, with the matcher name
  * @param author_unmatched graph authors that none of the matchers paired with an ORCID author
  * @param orcid_unmatched  ORCID authors remaining after the matched ones were removed
  */
case class ORCIDAuthorEnricherResult(
  @BeanProperty var id: String,
  @BeanProperty var enriched_author: java.util.List[Author],
  @BeanProperty var author_matched: java.util.List[MatchedAuthors],
  @BeanProperty var author_unmatched: java.util.List[Author],
  @BeanProperty var orcid_unmatched: java.util.List[OrcidAutor]
)

/** Enriches the authors of a graph record with ORCID identifiers, pairing graph authors with
  * candidate ORCID authors by name through a cascade of matchers of decreasing trust.
  */
object ORCIDAuthorEnricher extends Serializable {

  /** Matches graph authors against ORCID authors and adds an ORCID pid to every matched author.
    *
    * Matchers are applied in this order: `fullName`, `reversedFullName`, `orderedTokens`,
    * `creditName`, `otherNames`. Once an author or an ORCID entry has been matched it is not
    * considered by the following matchers.
    *
    * Side effects:
    *  - the `Author` instances contained in `graph_authors` are mutated (a pid is appended) when matched;
    *    the `graph_authors` list itself is not structurally modified;
    *  - matched entries are removed from `orcid_authors` IN PLACE, so that list must support
    *    `Iterator.remove`; what is left in it is returned as `orcid_unmatched`.
    *
    * @param id            identifier of the record, copied as is into the result
    * @param graph_authors authors of the record; `null` is treated as "no authors"
    * @param orcid_authors candidate ORCID authors; `null` is treated as "no candidates"
    * @return the enrichment outcome; `enriched_author` and `orcid_unmatched` are the very same list
    *         references received as input (possibly `null`)
    */
  def enrichOrcid(
    id: String,
    graph_authors: java.util.List[Author],
    orcid_authors: java.util.List[OrcidAutor]
  ): ORCIDAuthorEnricherResult = {
    // Author enriching strategy:
    // 1) create a copy of the graph author list in unmatched_authors
    // 2) for each author in unmatched_authors look for the first matching ORCID author, remove both from
    //    their lists and enrich the author, so that the enrichment is reflected in graph_authors
    //    (the two lists share the same Author instances)
    // 3) repeat (2) with different matching algorithms that have decreasing trust in their output
    // At the end unmatched_authors will contain the authors not matched by any of the matching algorithms
    //
    // A null graph_authors is treated as an empty list: ArrayList's copy constructor would otherwise
    // throw a NullPointerException.
    val unmatched_authors =
      if (graph_authors == null) new util.ArrayList[Author]()
      else new util.ArrayList[Author](graph_authors)

    val matches = {
      // Look for an exact fullname match, reconstruct ORCID fullname as givenName + familyName
      extractAndEnrichMatches(
        unmatched_authors,
        orcid_authors,
        (author, orcid) =>
          ORCIDAuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.givenName + " " + orcid.familyName),
        "fullName"
      ) ++
      // Look for an exact reversed fullname match, reconstruct ORCID fullname as familyName + givenName
      extractAndEnrichMatches(
        unmatched_authors,
        orcid_authors,
        (author, orcid) =>
          ORCIDAuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.familyName + " " + orcid.givenName),
        "reversedFullName"
      ) ++
      // split author names in tokens, order the tokens, then check for matches of full tokens or abbreviations
      extractAndEnrichMatches(
        unmatched_authors,
        orcid_authors,
        (author, orcid) =>
          ORCIDAuthorMatchers
            .matchOrderedTokenAndAbbreviations(author.getFullname, orcid.givenName + " " + orcid.familyName),
        "orderedTokens"
      ) ++
      // look for exact matches of the ORCID creditName
      extractAndEnrichMatches(
        unmatched_authors,
        orcid_authors,
        (author, orcid) => ORCIDAuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.creditName),
        "creditName"
      ) ++
      // look for exact matches in the ORCID otherNames
      extractAndEnrichMatches(
        unmatched_authors,
        orcid_authors,
        (author, orcid) =>
          orcid.otherNames != null && ORCIDAuthorMatchers.matchOtherNames(author.getFullname, orcid.otherNames.asScala),
        "otherNames"
      )
    }

    ORCIDAuthorEnricherResult(id, graph_authors, matches.asJava, unmatched_authors, orcid_authors)
  }

  /** Runs a single matcher over the still-unmatched authors and ORCID entries.
    *
    * For each author, ORCID entries are scanned in list order and the FIRST one accepted by
    * `matchingFunc` wins: both elements are removed from their lists (in place, through the
    * iterators) and the author receives a new ORCID pid whose provenance action is
    * `ORCID_ENRICHMENT`. No check is made for an ORCID pid the author may already carry.
    *
    * @param graph_authors unmatched graph authors; matched ones are removed. `null`/empty yields no matches
    * @param orcid_authors unmatched ORCID authors; matched ones are removed. `null`/empty yields no matches
    * @param matchingFunc  predicate deciding whether an author and an ORCID entry are the same person
    * @param matchName     label of the matcher, recorded in every produced [[MatchedAuthors]]
    * @return the matches found by this matcher, in author order
    */
  private def extractAndEnrichMatches(
    graph_authors: java.util.List[Author],
    orcid_authors: java.util.List[OrcidAutor],
    matchingFunc: (Author, OrcidAutor) => Boolean,
    matchName: String
  ): scala.collection.mutable.ArrayBuffer[MatchedAuthors] = {
    val matched = scala.collection.mutable.ArrayBuffer.empty[MatchedAuthors]

    val hasAuthors = graph_authors != null && !graph_authors.isEmpty
    // Guard the ORCID side as well: a null list would fail on `.iterator`, an empty one cannot match.
    val hasOrcids = orcid_authors != null && !orcid_authors.isEmpty

    if (hasAuthors && hasOrcids) {
      val ait = graph_authors.iterator

      while (ait.hasNext) {
        val author = ait.next()
        // A fresh iterator per author: entries removed by previous matches are no longer visible.
        val oit = orcid_authors.iterator

        breakable {
          while (oit.hasNext) {
            val orcid = oit.next()

            if (matchingFunc(author, orcid)) {
              // Remove through the iterators so both lists can be modified while being traversed.
              ait.remove()
              oit.remove()
              matched += MatchedAuthors(author, orcid, matchName)

              if (author.getPid == null) {
                author.setPid(new util.ArrayList[StructuredProperty]())
              }

              val orcidPID = OafUtils.createSP(orcid.orcid, ModelConstants.ORCID, ModelConstants.ORCID)
              orcidPID.setDataInfo(OafUtils.generateDataInfo())
              orcidPID.getDataInfo.setProvenanceaction(
                OafUtils.createQualifier("ORCID_ENRICHMENT", "ORCID_ENRICHMENT")
              )

              author.getPid.add(orcidPID)

              // One ORCID entry per author: stop scanning and move on to the next author.
              break()
            }
          }
        }
      }
    }

    matched
  }

}