Move AuthorMatchers in dhp-common

This commit is contained in:
Giambattista Bloisi 2024-10-30 15:23:05 +01:00
parent dccbcfd36c
commit 36ca0b123e
5 changed files with 47 additions and 50 deletions

View File

@ -1,4 +1,4 @@
package eu.dnetlib.pace.util
package eu.dnetlib.dhp.utils
import java.util.Locale
import java.util.regex.Pattern
@ -73,7 +73,6 @@ object AuthorMatchers {
removeMatches(graph_authors, orcid_authors, (a, b) => matchingFunc(a, b))
}
def removeMatches(
graph_authors: java.util.List[String],
orcid_authors: java.util.List[String],

View File

@ -1,21 +1,44 @@
package eu.dnetlib.dhp.enrich.orcid
package eu.dnetlib.dhp.utils
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.{Author, StructuredProperty}
import eu.dnetlib.dhp.schema.sx.OafUtils
import eu.dnetlib.pace.util.AuthorMatchers
import java.util
import scala.beans.BeanProperty
import scala.collection.JavaConverters._
import scala.util.control.Breaks.{break, breakable}
case class OrcidAuthor(
@BeanProperty var orcid: String,
@BeanProperty var familyName: String,
@BeanProperty var givenName: String,
@BeanProperty var creditName: String,
@BeanProperty var otherNames: java.util.List[String]
) {
def this() = this("null", "null", "null", "null", null)
}
case class MatchedAuthors(
@BeanProperty var author: Author,
@BeanProperty var orcid: OrcidAuthor,
@BeanProperty var `type`: String
)
case class MatchData(
@BeanProperty var id: String,
@BeanProperty var graph_authors: java.util.List[Author],
@BeanProperty var orcid_authors: java.util.List[OrcidAuthor]
) {
def this() = this("null", null, null)
}
case class ORCIDAuthorEnricherResult(
@BeanProperty var id: String,
@BeanProperty var enriched_author: java.util.List[Author],
@BeanProperty var author_matched: java.util.List[MatchedAuthors],
@BeanProperty var author_unmatched: java.util.List[Author],
@BeanProperty var orcid_unmatched: java.util.List[OrcidAutor]
@BeanProperty var orcid_unmatched: java.util.List[OrcidAuthor]
)
object ORCIDAuthorEnricher extends Serializable {
@ -23,7 +46,7 @@ object ORCIDAuthorEnricher extends Serializable {
def enrichOrcid(
id: String,
graph_authors: java.util.List[Author],
orcid_authors: java.util.List[OrcidAutor]
orcid_authors: java.util.List[OrcidAuthor]
): ORCIDAuthorEnricherResult = {
// Author enriching strategy:
// 1) create a copy of graph author list in unmatched_authors
@ -82,8 +105,8 @@ object ORCIDAuthorEnricher extends Serializable {
private def extractAndEnrichMatches(
graph_authors: java.util.List[Author],
orcid_authors: java.util.List[OrcidAutor],
matchingFunc: (Author, OrcidAutor) => Boolean,
orcid_authors: java.util.List[OrcidAuthor],
matchingFunc: (Author, OrcidAuthor) => Boolean,
matchName: String
) = {
val matched = scala.collection.mutable.ArrayBuffer.empty[MatchedAuthors]

View File

@ -1,10 +1,10 @@
package eu.dnetlib.dhp.enrich.orcid
package eu.dnetlib.dhp.utils
import eu.dnetlib.pace.util.AuthorMatchers.matchOrderedTokenAndAbbreviations
import eu.dnetlib.dhp.utils.AuthorMatchers.matchOrderedTokenAndAbbreviations
import org.junit.jupiter.api.Assertions.{assertFalse, assertTrue}
import org.junit.jupiter.api.Test
class ORCIDAuthorMatchersTest {
class AuthorMatchersTest {
@Test def testShortNames(): Unit = {
assertTrue(matchOrderedTokenAndAbbreviations("Lasagni Mariozzi Federico", "Lasagni F. Mariozzi"))

View File

@ -9,11 +9,11 @@ import java.util.stream.Collectors;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.dhp.utils.AuthorMatchers;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Person;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import eu.dnetlib.pace.util.AuthorMatchers;
@ComparatorClass("authorsMatch")
public class AuthorsMatch extends AbstractListComparator {

View File

@ -2,38 +2,13 @@ package eu.dnetlib.dhp.enrich.orcid
import eu.dnetlib.dhp.application.AbstractScalaApplication
import eu.dnetlib.dhp.schema.common.ModelSupport
import eu.dnetlib.dhp.schema.oaf._
import eu.dnetlib.dhp.utils.{MatchData, ORCIDAuthorEnricher, ORCIDAuthorEnricherResult}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.slf4j.{Logger, LoggerFactory}
import scala.beans.BeanProperty
import scala.collection.JavaConverters._
case class OrcidAutor(
@BeanProperty var orcid: String,
@BeanProperty var familyName: String,
@BeanProperty var givenName: String,
@BeanProperty var creditName: String,
@BeanProperty var otherNames: java.util.List[String]
) {
def this() = this("null", "null", "null", "null", null)
}
case class MatchData(
@BeanProperty var id: String,
@BeanProperty var graph_authors: java.util.List[Author],
@BeanProperty var orcid_authors: java.util.List[OrcidAutor]
) {
def this() = this("null", null, null)
}
case class MatchedAuthors(
@BeanProperty var author: Author,
@BeanProperty var orcid: OrcidAutor,
@BeanProperty var `type`: String
)
class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String], log: Logger)
extends AbstractScalaApplication(propertyPath, args, log: Logger) {
@ -87,7 +62,7 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
}
private def createTemporaryData(graphPath: String, orcidPath: String, targetPath: String): Unit = {
def createTemporaryData(graphPath: String, orcidPath: String, targetPath: String): Unit = {
val orcidAuthors =
spark.read.load(s"$orcidPath/Authors").select("orcid", "familyName", "givenName", "creditName", "otherNames")