Move AuthorMatchers to dhp-common

This commit is contained in:
Giambattista Bloisi 2024-10-30 15:23:05 +01:00
parent dccbcfd36c
commit 36ca0b123e
5 changed files with 47 additions and 50 deletions

View File

@ -1,4 +1,4 @@
package eu.dnetlib.pace.util package eu.dnetlib.dhp.utils
import java.util.Locale import java.util.Locale
import java.util.regex.Pattern import java.util.regex.Pattern
@ -66,19 +66,18 @@ object AuthorMatchers {
} }
def removeMatches( def removeMatches(
graph_authors: java.util.List[String], graph_authors: java.util.List[String],
orcid_authors: java.util.List[String], orcid_authors: java.util.List[String],
matchingFunc: java.util.function.BiFunction[String,String,Boolean] matchingFunc: java.util.function.BiFunction[String, String, Boolean]
) : java.util.List[String] = { ): java.util.List[String] = {
removeMatches(graph_authors, orcid_authors, (a, b) => matchingFunc(a,b)) removeMatches(graph_authors, orcid_authors, (a, b) => matchingFunc(a, b))
} }
def removeMatches( def removeMatches(
graph_authors: java.util.List[String], graph_authors: java.util.List[String],
orcid_authors: java.util.List[String], orcid_authors: java.util.List[String],
matchingFunc: (String, String) => Boolean matchingFunc: (String, String) => Boolean
) : java.util.List[String] = { ): java.util.List[String] = {
val matched = new java.util.ArrayList[String]() val matched = new java.util.ArrayList[String]()
if (graph_authors != null && !graph_authors.isEmpty) { if (graph_authors != null && !graph_authors.isEmpty) {

View File

@ -1,21 +1,44 @@
package eu.dnetlib.dhp.enrich.orcid package eu.dnetlib.dhp.utils
import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.{Author, StructuredProperty} import eu.dnetlib.dhp.schema.oaf.{Author, StructuredProperty}
import eu.dnetlib.dhp.schema.sx.OafUtils import eu.dnetlib.dhp.schema.sx.OafUtils
import eu.dnetlib.pace.util.AuthorMatchers
import java.util import java.util
import scala.beans.BeanProperty import scala.beans.BeanProperty
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
import scala.util.control.Breaks.{break, breakable} import scala.util.control.Breaks.{break, breakable}
case class OrcidAuthor(
@BeanProperty var orcid: String,
@BeanProperty var familyName: String,
@BeanProperty var givenName: String,
@BeanProperty var creditName: String,
@BeanProperty var otherNames: java.util.List[String]
) {
def this() = this("null", "null", "null", "null", null)
}
case class MatchedAuthors(
@BeanProperty var author: Author,
@BeanProperty var orcid: OrcidAuthor,
@BeanProperty var `type`: String
)
case class MatchData(
@BeanProperty var id: String,
@BeanProperty var graph_authors: java.util.List[Author],
@BeanProperty var orcid_authors: java.util.List[OrcidAuthor]
) {
def this() = this("null", null, null)
}
case class ORCIDAuthorEnricherResult( case class ORCIDAuthorEnricherResult(
@BeanProperty var id: String, @BeanProperty var id: String,
@BeanProperty var enriched_author: java.util.List[Author], @BeanProperty var enriched_author: java.util.List[Author],
@BeanProperty var author_matched: java.util.List[MatchedAuthors], @BeanProperty var author_matched: java.util.List[MatchedAuthors],
@BeanProperty var author_unmatched: java.util.List[Author], @BeanProperty var author_unmatched: java.util.List[Author],
@BeanProperty var orcid_unmatched: java.util.List[OrcidAutor] @BeanProperty var orcid_unmatched: java.util.List[OrcidAuthor]
) )
object ORCIDAuthorEnricher extends Serializable { object ORCIDAuthorEnricher extends Serializable {
@ -23,7 +46,7 @@ object ORCIDAuthorEnricher extends Serializable {
def enrichOrcid( def enrichOrcid(
id: String, id: String,
graph_authors: java.util.List[Author], graph_authors: java.util.List[Author],
orcid_authors: java.util.List[OrcidAutor] orcid_authors: java.util.List[OrcidAuthor]
): ORCIDAuthorEnricherResult = { ): ORCIDAuthorEnricherResult = {
// Author enriching strategy: // Author enriching strategy:
// 1) create a copy of graph author list in unmatched_authors // 1) create a copy of graph author list in unmatched_authors
@ -81,10 +104,10 @@ object ORCIDAuthorEnricher extends Serializable {
} }
private def extractAndEnrichMatches( private def extractAndEnrichMatches(
graph_authors: java.util.List[Author], graph_authors: java.util.List[Author],
orcid_authors: java.util.List[OrcidAutor], orcid_authors: java.util.List[OrcidAuthor],
matchingFunc: (Author, OrcidAutor) => Boolean, matchingFunc: (Author, OrcidAuthor) => Boolean,
matchName: String matchName: String
) = { ) = {
val matched = scala.collection.mutable.ArrayBuffer.empty[MatchedAuthors] val matched = scala.collection.mutable.ArrayBuffer.empty[MatchedAuthors]

View File

@ -1,10 +1,10 @@
package eu.dnetlib.dhp.enrich.orcid package eu.dnetlib.dhp.utils
import eu.dnetlib.pace.util.AuthorMatchers.matchOrderedTokenAndAbbreviations import eu.dnetlib.dhp.utils.AuthorMatchers.matchOrderedTokenAndAbbreviations
import org.junit.jupiter.api.Assertions.{assertFalse, assertTrue} import org.junit.jupiter.api.Assertions.{assertFalse, assertTrue}
import org.junit.jupiter.api.Test import org.junit.jupiter.api.Test
class ORCIDAuthorMatchersTest { class AuthorMatchersTest {
@Test def testShortNames(): Unit = { @Test def testShortNames(): Unit = {
assertTrue(matchOrderedTokenAndAbbreviations("Lasagni Mariozzi Federico", "Lasagni F. Mariozzi")) assertTrue(matchOrderedTokenAndAbbreviations("Lasagni Mariozzi Federico", "Lasagni F. Mariozzi"))

View File

@ -9,11 +9,11 @@ import java.util.stream.Collectors;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.dhp.utils.AuthorMatchers;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Person; import eu.dnetlib.pace.model.Person;
import eu.dnetlib.pace.tree.support.AbstractListComparator; import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import eu.dnetlib.pace.util.AuthorMatchers;
@ComparatorClass("authorsMatch") @ComparatorClass("authorsMatch")
public class AuthorsMatch extends AbstractListComparator { public class AuthorsMatch extends AbstractListComparator {

View File

@ -2,38 +2,13 @@ package eu.dnetlib.dhp.enrich.orcid
import eu.dnetlib.dhp.application.AbstractScalaApplication import eu.dnetlib.dhp.application.AbstractScalaApplication
import eu.dnetlib.dhp.schema.common.ModelSupport import eu.dnetlib.dhp.schema.common.ModelSupport
import eu.dnetlib.dhp.schema.oaf._ import eu.dnetlib.dhp.utils.{MatchData, ORCIDAuthorEnricher, ORCIDAuthorEnricherResult}
import org.apache.spark.sql._ import org.apache.spark.sql._
import org.apache.spark.sql.functions._ import org.apache.spark.sql.functions._
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import scala.beans.BeanProperty
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
case class OrcidAutor(
@BeanProperty var orcid: String,
@BeanProperty var familyName: String,
@BeanProperty var givenName: String,
@BeanProperty var creditName: String,
@BeanProperty var otherNames: java.util.List[String]
) {
def this() = this("null", "null", "null", "null", null)
}
case class MatchData(
@BeanProperty var id: String,
@BeanProperty var graph_authors: java.util.List[Author],
@BeanProperty var orcid_authors: java.util.List[OrcidAutor]
) {
def this() = this("null", null, null)
}
case class MatchedAuthors(
@BeanProperty var author: Author,
@BeanProperty var orcid: OrcidAutor,
@BeanProperty var `type`: String
)
class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String], log: Logger) class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String], log: Logger)
extends AbstractScalaApplication(propertyPath, args, log: Logger) { extends AbstractScalaApplication(propertyPath, args, log: Logger) {
@ -87,7 +62,7 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
} }
private def createTemporaryData(graphPath: String, orcidPath: String, targetPath: String): Unit = { def createTemporaryData(graphPath: String, orcidPath: String, targetPath: String): Unit = {
val orcidAuthors = val orcidAuthors =
spark.read.load(s"$orcidPath/Authors").select("orcid", "familyName", "givenName", "creditName", "otherNames") spark.read.load(s"$orcidPath/Authors").select("orcid", "familyName", "givenName", "creditName", "otherNames")