From 36ca0b123e1b2fbf6ea8fdd7cf8989357dde85ef Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Wed, 30 Oct 2024 15:23:05 +0100 Subject: [PATCH] Move AuthorMatchers in dhp-common --- .../dnetlib/dhp/utils}/AuthorMatchers.scala | 21 +++++----- .../dhp/utils}/ORCIDAuthorEnricher.scala | 39 +++++++++++++++---- .../dhp/utils/AuthorMatchersTest.scala | 6 +-- .../eu/dnetlib/pace/tree/AuthorsMatch.java | 2 +- .../SparkEnrichGraphWithOrcidAuthors.scala | 29 +------------- 5 files changed, 47 insertions(+), 50 deletions(-) rename {dhp-pace-core/src/main/java/eu/dnetlib/pace/util => dhp-common/src/main/scala/eu/dnetlib/dhp/utils}/AuthorMatchers.scala (81%) rename {dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid => dhp-common/src/main/scala/eu/dnetlib/dhp/utils}/ORCIDAuthorEnricher.scala (78%) rename dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala => dhp-common/src/test/java/eu/dnetlib/dhp/utils/AuthorMatchersTest.scala (91%) diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/AuthorMatchers.scala b/dhp-common/src/main/scala/eu/dnetlib/dhp/utils/AuthorMatchers.scala similarity index 81% rename from dhp-pace-core/src/main/java/eu/dnetlib/pace/util/AuthorMatchers.scala rename to dhp-common/src/main/scala/eu/dnetlib/dhp/utils/AuthorMatchers.scala index 116f515ed..5f842726f 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/AuthorMatchers.scala +++ b/dhp-common/src/main/scala/eu/dnetlib/dhp/utils/AuthorMatchers.scala @@ -1,4 +1,4 @@ -package eu.dnetlib.pace.util +package eu.dnetlib.dhp.utils import java.util.Locale import java.util.regex.Pattern @@ -66,19 +66,18 @@ object AuthorMatchers { } def removeMatches( - graph_authors: java.util.List[String], - orcid_authors: java.util.List[String], - matchingFunc: java.util.function.BiFunction[String,String,Boolean] - ) : java.util.List[String] = { - removeMatches(graph_authors, orcid_authors, (a, b) => matchingFunc(a,b)) + graph_authors: java.util.List[String], + orcid_authors: java.util.List[String], + matchingFunc: java.util.function.BiFunction[String, String, Boolean] + ): java.util.List[String] = { + removeMatches(graph_authors, orcid_authors, (a, b) => matchingFunc(a, b)) } - def removeMatches( - graph_authors: java.util.List[String], - orcid_authors: java.util.List[String], - matchingFunc: (String, String) => Boolean - ) : java.util.List[String] = { + graph_authors: java.util.List[String], + orcid_authors: java.util.List[String], + matchingFunc: (String, String) => Boolean + ): java.util.List[String] = { val matched = new java.util.ArrayList[String]() if (graph_authors != null && !graph_authors.isEmpty) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorEnricher.scala b/dhp-common/src/main/scala/eu/dnetlib/dhp/utils/ORCIDAuthorEnricher.scala similarity index 78% rename from dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorEnricher.scala rename to dhp-common/src/main/scala/eu/dnetlib/dhp/utils/ORCIDAuthorEnricher.scala index 2e23a3a59..6a087b1de 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorEnricher.scala +++ b/dhp-common/src/main/scala/eu/dnetlib/dhp/utils/ORCIDAuthorEnricher.scala @@ -1,21 +1,44 @@ -package eu.dnetlib.dhp.enrich.orcid +package eu.dnetlib.dhp.utils import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf.{Author, StructuredProperty} import eu.dnetlib.dhp.schema.sx.OafUtils -import eu.dnetlib.pace.util.AuthorMatchers import java.util import scala.beans.BeanProperty import scala.collection.JavaConverters._ import scala.util.control.Breaks.{break, breakable} +case class OrcidAuthor( + @BeanProperty var orcid: String, + @BeanProperty var familyName: String, + @BeanProperty var givenName: String, + @BeanProperty var creditName: String, + @BeanProperty var otherNames: java.util.List[String] +) { + def this() = this("null", "null", "null", "null", null) +} + +case class MatchedAuthors( + @BeanProperty var author: Author, + @BeanProperty var orcid: OrcidAuthor, + @BeanProperty var `type`: String +) + +case class MatchData( + @BeanProperty var id: String, + @BeanProperty var graph_authors: java.util.List[Author], + @BeanProperty var orcid_authors: java.util.List[OrcidAuthor] +) { + def this() = this("null", null, null) +} + case class ORCIDAuthorEnricherResult( @BeanProperty var id: String, @BeanProperty var enriched_author: java.util.List[Author], @BeanProperty var author_matched: java.util.List[MatchedAuthors], @BeanProperty var author_unmatched: java.util.List[Author], - @BeanProperty var orcid_unmatched: java.util.List[OrcidAutor] + @BeanProperty var orcid_unmatched: java.util.List[OrcidAuthor] ) object ORCIDAuthorEnricher extends Serializable { @@ -23,7 +46,7 @@ object ORCIDAuthorEnricher extends Serializable { def enrichOrcid( id: String, graph_authors: java.util.List[Author], - orcid_authors: java.util.List[OrcidAutor] + orcid_authors: java.util.List[OrcidAuthor] ): ORCIDAuthorEnricherResult = { // Author enriching strategy: // 1) create a copy of graph author list in unmatched_authors @@ -81,10 +104,10 @@ object ORCIDAuthorEnricher extends Serializable { } private def extractAndEnrichMatches( - graph_authors: java.util.List[Author], - orcid_authors: java.util.List[OrcidAutor], - matchingFunc: (Author, OrcidAutor) => Boolean, - matchName: String + graph_authors: java.util.List[Author], + orcid_authors: java.util.List[OrcidAuthor], + matchingFunc: (Author, OrcidAuthor) => Boolean, + matchName: String ) = { val matched = scala.collection.mutable.ArrayBuffer.empty[MatchedAuthors] diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala b/dhp-common/src/test/java/eu/dnetlib/dhp/utils/AuthorMatchersTest.scala similarity index 91% rename from dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala rename to dhp-common/src/test/java/eu/dnetlib/dhp/utils/AuthorMatchersTest.scala index eece56b74..fc9dcd1da 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/utils/AuthorMatchersTest.scala @@ -1,10 +1,10 @@ -package eu.dnetlib.dhp.enrich.orcid +package eu.dnetlib.dhp.utils -import eu.dnetlib.pace.util.AuthorMatchers.matchOrderedTokenAndAbbreviations +import eu.dnetlib.dhp.utils.AuthorMatchers.matchOrderedTokenAndAbbreviations import org.junit.jupiter.api.Assertions.{assertFalse, assertTrue} import org.junit.jupiter.api.Test -class ORCIDAuthorMatchersTest { +class AuthorMatchersTest { @Test def testShortNames(): Unit = { assertTrue(matchOrderedTokenAndAbbreviations("Lasagni Mariozzi Federico", "Lasagni F. Mariozzi")) diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java index 07080b09e..dcf9241ed 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java @@ -9,11 +9,11 @@ import java.util.stream.Collectors; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.dhp.utils.AuthorMatchers; import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Person; import eu.dnetlib.pace.tree.support.AbstractListComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import eu.dnetlib.pace.util.AuthorMatchers; @ComparatorClass("authorsMatch") public class AuthorsMatch extends AbstractListComparator { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala index 847a5f090..e615f02d8 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala @@ -2,38 +2,13 @@ package eu.dnetlib.dhp.enrich.orcid import eu.dnetlib.dhp.application.AbstractScalaApplication import eu.dnetlib.dhp.schema.common.ModelSupport -import eu.dnetlib.dhp.schema.oaf._ +import eu.dnetlib.dhp.utils.{MatchData, ORCIDAuthorEnricher, ORCIDAuthorEnricherResult} import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.slf4j.{Logger, LoggerFactory} -import scala.beans.BeanProperty import scala.collection.JavaConverters._ -case class OrcidAutor( - @BeanProperty var orcid: String, - @BeanProperty var familyName: String, - @BeanProperty var givenName: String, - @BeanProperty var creditName: String, - @BeanProperty var otherNames: java.util.List[String] -) { - def this() = this("null", "null", "null", "null", null) -} - -case class MatchData( - @BeanProperty var id: String, - @BeanProperty var graph_authors: java.util.List[Author], - @BeanProperty var orcid_authors: java.util.List[OrcidAutor] -) { - def this() = this("null", null, null) -} - -case class MatchedAuthors( - @BeanProperty var author: Author, - @BeanProperty var orcid: OrcidAutor, - @BeanProperty var `type`: String -) - class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String], log: Logger) extends AbstractScalaApplication(propertyPath, args, log: Logger) { @@ -87,7 +62,7 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String] } - private def createTemporaryData(graphPath: String, orcidPath: String, targetPath: String): Unit = { + def createTemporaryData(graphPath: String, orcidPath: String, targetPath: String): Unit = { val orcidAuthors = spark.read.load(s"$orcidPath/Authors").select("orcid", "familyName", "givenName", "creditName", "otherNames")