Move AuthorMatchers into dhp-common

2024-10-30 15:23:05 +01:00 · 2024-10-30 15:23:05 +01:00 · 36ca0b123e
parent dccbcfd36c
commit 36ca0b123e
5 changed files with 47 additions and 50 deletions
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/AuthorMatchers.scala
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/AuthorMatchers.scala
@ -1,4 +1,4 @@
-package eu.dnetlib.pace.util
+package eu.dnetlib.dhp.utils

 import java.util.Locale
 import java.util.regex.Pattern
@ -66,19 +66,18 @@ object AuthorMatchers {
  }

  def removeMatches(
-                     graph_authors: java.util.List[String],
-                     orcid_authors: java.util.List[String],
-                     matchingFunc: java.util.function.BiFunction[String,String,Boolean]
-                   ) : java.util.List[String] = {
-    removeMatches(graph_authors, orcid_authors, (a, b) => matchingFunc(a,b))
+    graph_authors: java.util.List[String],
+    orcid_authors: java.util.List[String],
+    matchingFunc: java.util.function.BiFunction[String, String, Boolean]
+  ): java.util.List[String] = {
+    removeMatches(graph_authors, orcid_authors, (a, b) => matchingFunc(a, b))
  }

-
  def removeMatches(
-                                       graph_authors: java.util.List[String],
-                                       orcid_authors: java.util.List[String],
-                                       matchingFunc: (String, String) => Boolean
-                                     ) : java.util.List[String]  = {
+    graph_authors: java.util.List[String],
+    orcid_authors: java.util.List[String],
+    matchingFunc: (String, String) => Boolean
+  ): java.util.List[String] = {
    val matched = new java.util.ArrayList[String]()

    if (graph_authors != null && !graph_authors.isEmpty) {
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorEnricher.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorEnricher.scala
@ -1,21 +1,44 @@
-package eu.dnetlib.dhp.enrich.orcid
+package eu.dnetlib.dhp.utils

 import eu.dnetlib.dhp.schema.common.ModelConstants
 import eu.dnetlib.dhp.schema.oaf.{Author, StructuredProperty}
 import eu.dnetlib.dhp.schema.sx.OafUtils
-import eu.dnetlib.pace.util.AuthorMatchers

 import java.util
 import scala.beans.BeanProperty
 import scala.collection.JavaConverters._
 import scala.util.control.Breaks.{break, breakable}

+case class OrcidAuthor(
+  @BeanProperty var orcid: String,
+  @BeanProperty var familyName: String,
+  @BeanProperty var givenName: String,
+  @BeanProperty var creditName: String,
+  @BeanProperty var otherNames: java.util.List[String]
+) {
+  def this() = this("null", "null", "null", "null", null)
+}
+
+case class MatchedAuthors(
+                           @BeanProperty var author: Author,
+                           @BeanProperty var orcid: OrcidAuthor,
+                           @BeanProperty var `type`: String
+)
+
+case class MatchData(
+  @BeanProperty var id: String,
+  @BeanProperty var graph_authors: java.util.List[Author],
+  @BeanProperty var orcid_authors: java.util.List[OrcidAuthor]
+) {
+  def this() = this("null", null, null)
+}
+
 case class ORCIDAuthorEnricherResult(
  @BeanProperty var id: String,
  @BeanProperty var enriched_author: java.util.List[Author],
  @BeanProperty var author_matched: java.util.List[MatchedAuthors],
  @BeanProperty var author_unmatched: java.util.List[Author],
-  @BeanProperty var orcid_unmatched: java.util.List[OrcidAutor]
+  @BeanProperty var orcid_unmatched: java.util.List[OrcidAuthor]
 )

 object ORCIDAuthorEnricher extends Serializable {
@ -23,7 +46,7 @@ object ORCIDAuthorEnricher extends Serializable {
  def enrichOrcid(
    id: String,
    graph_authors: java.util.List[Author],
-    orcid_authors: java.util.List[OrcidAutor]
+    orcid_authors: java.util.List[OrcidAuthor]
  ): ORCIDAuthorEnricherResult = {
    // Author enriching strategy:
    // 1) create a copy of graph author list in unmatched_authors
@ -81,10 +104,10 @@ object ORCIDAuthorEnricher extends Serializable {
  }

  private def extractAndEnrichMatches(
-    graph_authors: java.util.List[Author],
-    orcid_authors: java.util.List[OrcidAutor],
-    matchingFunc: (Author, OrcidAutor) => Boolean,
-    matchName: String
+                                       graph_authors: java.util.List[Author],
+                                       orcid_authors: java.util.List[OrcidAuthor],
+                                       matchingFunc: (Author, OrcidAuthor) => Boolean,
+                                       matchName: String
  ) = {
    val matched = scala.collection.mutable.ArrayBuffer.empty[MatchedAuthors]

--- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala
@ -1,10 +1,10 @@
-package eu.dnetlib.dhp.enrich.orcid
+package eu.dnetlib.dhp.utils

-import eu.dnetlib.pace.util.AuthorMatchers.matchOrderedTokenAndAbbreviations
+import eu.dnetlib.dhp.utils.AuthorMatchers.matchOrderedTokenAndAbbreviations
 import org.junit.jupiter.api.Assertions.{assertFalse, assertTrue}
 import org.junit.jupiter.api.Test

-class ORCIDAuthorMatchersTest {
+class AuthorMatchersTest {

  @Test def testShortNames(): Unit = {
    assertTrue(matchOrderedTokenAndAbbreviations("Lasagni Mariozzi Federico", "Lasagni F. Mariozzi"))
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java
@ -9,11 +9,11 @@ import java.util.stream.Collectors;

 import com.wcohen.ss.AbstractStringDistance;

+import eu.dnetlib.dhp.utils.AuthorMatchers;
 import eu.dnetlib.pace.config.Config;
 import eu.dnetlib.pace.model.Person;
 import eu.dnetlib.pace.tree.support.AbstractListComparator;
 import eu.dnetlib.pace.tree.support.ComparatorClass;
-import eu.dnetlib.pace.util.AuthorMatchers;

@ComparatorClass("authorsMatch")
 public class AuthorsMatch extends AbstractListComparator {
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala
@ -2,38 +2,13 @@ package eu.dnetlib.dhp.enrich.orcid

 import eu.dnetlib.dhp.application.AbstractScalaApplication
 import eu.dnetlib.dhp.schema.common.ModelSupport
-import eu.dnetlib.dhp.schema.oaf._
+import eu.dnetlib.dhp.utils.{MatchData, ORCIDAuthorEnricher, ORCIDAuthorEnricherResult}
 import org.apache.spark.sql._
 import org.apache.spark.sql.functions._
 import org.slf4j.{Logger, LoggerFactory}

-import scala.beans.BeanProperty
 import scala.collection.JavaConverters._

-case class OrcidAutor(
-  @BeanProperty var orcid: String,
-  @BeanProperty var familyName: String,
-  @BeanProperty var givenName: String,
-  @BeanProperty var creditName: String,
-  @BeanProperty var otherNames: java.util.List[String]
-) {
-  def this() = this("null", "null", "null", "null", null)
-}
-
-case class MatchData(
-  @BeanProperty var id: String,
-  @BeanProperty var graph_authors: java.util.List[Author],
-  @BeanProperty var orcid_authors: java.util.List[OrcidAutor]
-) {
-  def this() = this("null", null, null)
-}
-
-case class MatchedAuthors(
-  @BeanProperty var author: Author,
-  @BeanProperty var orcid: OrcidAutor,
-  @BeanProperty var `type`: String
-)
-
 class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String], log: Logger)
    extends AbstractScalaApplication(propertyPath, args, log: Logger) {

@ -87,7 +62,7 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]

  }

-  private def createTemporaryData(graphPath: String, orcidPath: String, targetPath: String): Unit = {
+  def createTemporaryData(graphPath: String, orcidPath: String, targetPath: String): Unit = {
    val orcidAuthors =
      spark.read.load(s"$orcidPath/Authors").select("orcid", "familyName", "givenName", "creditName", "otherNames")