Move AuthorMatchers in dhp-common
This commit is contained in:
parent
26cdc7e439
commit
d67f125614
|
@ -1,4 +1,4 @@
|
|||
package eu.dnetlib.pace.util
|
||||
package eu.dnetlib.dhp.utils
|
||||
|
||||
import java.util.Locale
|
||||
import java.util.regex.Pattern
|
||||
|
@ -66,19 +66,18 @@ object AuthorMatchers {
|
|||
}
|
||||
|
||||
def removeMatches(
|
||||
graph_authors: java.util.List[String],
|
||||
orcid_authors: java.util.List[String],
|
||||
matchingFunc: java.util.function.BiFunction[String,String,Boolean]
|
||||
) : java.util.List[String] = {
|
||||
removeMatches(graph_authors, orcid_authors, (a, b) => matchingFunc(a,b))
|
||||
graph_authors: java.util.List[String],
|
||||
orcid_authors: java.util.List[String],
|
||||
matchingFunc: java.util.function.BiFunction[String, String, Boolean]
|
||||
): java.util.List[String] = {
|
||||
removeMatches(graph_authors, orcid_authors, (a, b) => matchingFunc(a, b))
|
||||
}
|
||||
|
||||
|
||||
def removeMatches(
|
||||
graph_authors: java.util.List[String],
|
||||
orcid_authors: java.util.List[String],
|
||||
matchingFunc: (String, String) => Boolean
|
||||
) : java.util.List[String] = {
|
||||
graph_authors: java.util.List[String],
|
||||
orcid_authors: java.util.List[String],
|
||||
matchingFunc: (String, String) => Boolean
|
||||
): java.util.List[String] = {
|
||||
val matched = new java.util.ArrayList[String]()
|
||||
|
||||
if (graph_authors != null && !graph_authors.isEmpty) {
|
|
@ -1,21 +1,44 @@
|
|||
package eu.dnetlib.dhp.enrich.orcid
|
||||
package eu.dnetlib.dhp.utils
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import eu.dnetlib.dhp.schema.oaf.{Author, StructuredProperty}
|
||||
import eu.dnetlib.dhp.schema.sx.OafUtils
|
||||
import eu.dnetlib.pace.util.AuthorMatchers
|
||||
|
||||
import java.util
|
||||
import scala.beans.BeanProperty
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.util.control.Breaks.{break, breakable}
|
||||
|
||||
case class OrcidAuthor(
|
||||
@BeanProperty var orcid: String,
|
||||
@BeanProperty var familyName: String,
|
||||
@BeanProperty var givenName: String,
|
||||
@BeanProperty var creditName: String,
|
||||
@BeanProperty var otherNames: java.util.List[String]
|
||||
) {
|
||||
def this() = this("null", "null", "null", "null", null)
|
||||
}
|
||||
|
||||
case class MatchedAuthors(
|
||||
@BeanProperty var author: Author,
|
||||
@BeanProperty var orcid: OrcidAuthor,
|
||||
@BeanProperty var `type`: String
|
||||
)
|
||||
|
||||
case class MatchData(
|
||||
@BeanProperty var id: String,
|
||||
@BeanProperty var graph_authors: java.util.List[Author],
|
||||
@BeanProperty var orcid_authors: java.util.List[OrcidAuthor]
|
||||
) {
|
||||
def this() = this("null", null, null)
|
||||
}
|
||||
|
||||
case class ORCIDAuthorEnricherResult(
|
||||
@BeanProperty var id: String,
|
||||
@BeanProperty var enriched_author: java.util.List[Author],
|
||||
@BeanProperty var author_matched: java.util.List[MatchedAuthors],
|
||||
@BeanProperty var author_unmatched: java.util.List[Author],
|
||||
@BeanProperty var orcid_unmatched: java.util.List[OrcidAutor]
|
||||
@BeanProperty var orcid_unmatched: java.util.List[OrcidAuthor]
|
||||
)
|
||||
|
||||
object ORCIDAuthorEnricher extends Serializable {
|
||||
|
@ -23,7 +46,7 @@ object ORCIDAuthorEnricher extends Serializable {
|
|||
def enrichOrcid(
|
||||
id: String,
|
||||
graph_authors: java.util.List[Author],
|
||||
orcid_authors: java.util.List[OrcidAutor]
|
||||
orcid_authors: java.util.List[OrcidAuthor]
|
||||
): ORCIDAuthorEnricherResult = {
|
||||
// Author enriching strategy:
|
||||
// 1) create a copy of graph author list in unmatched_authors
|
||||
|
@ -81,10 +104,10 @@ object ORCIDAuthorEnricher extends Serializable {
|
|||
}
|
||||
|
||||
private def extractAndEnrichMatches(
|
||||
graph_authors: java.util.List[Author],
|
||||
orcid_authors: java.util.List[OrcidAutor],
|
||||
matchingFunc: (Author, OrcidAutor) => Boolean,
|
||||
matchName: String
|
||||
graph_authors: java.util.List[Author],
|
||||
orcid_authors: java.util.List[OrcidAuthor],
|
||||
matchingFunc: (Author, OrcidAuthor) => Boolean,
|
||||
matchName: String
|
||||
) = {
|
||||
val matched = scala.collection.mutable.ArrayBuffer.empty[MatchedAuthors]
|
||||
|
|
@ -1,10 +1,10 @@
|
|||
package eu.dnetlib.dhp.enrich.orcid
|
||||
package eu.dnetlib.dhp.utils
|
||||
|
||||
import eu.dnetlib.pace.util.AuthorMatchers.matchOrderedTokenAndAbbreviations
|
||||
import eu.dnetlib.dhp.utils.AuthorMatchers.matchOrderedTokenAndAbbreviations
|
||||
import org.junit.jupiter.api.Assertions.{assertFalse, assertTrue}
|
||||
import org.junit.jupiter.api.Test
|
||||
|
||||
class ORCIDAuthorMatchersTest {
|
||||
class AuthorMatchersTest {
|
||||
|
||||
@Test def testShortNames(): Unit = {
|
||||
assertTrue(matchOrderedTokenAndAbbreviations("Lasagni Mariozzi Federico", "Lasagni F. Mariozzi"))
|
|
@ -9,11 +9,11 @@ import java.util.stream.Collectors;
|
|||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.dhp.utils.AuthorMatchers;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Person;
|
||||
import eu.dnetlib.pace.tree.support.AbstractListComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import eu.dnetlib.pace.util.AuthorMatchers;
|
||||
|
||||
@ComparatorClass("authorsMatch")
|
||||
public class AuthorsMatch extends AbstractListComparator {
|
||||
|
|
|
@ -2,38 +2,13 @@ package eu.dnetlib.dhp.enrich.orcid
|
|||
|
||||
import eu.dnetlib.dhp.application.AbstractScalaApplication
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport
|
||||
import eu.dnetlib.dhp.schema.oaf._
|
||||
import eu.dnetlib.dhp.utils.{MatchData, ORCIDAuthorEnricher, ORCIDAuthorEnricherResult}
|
||||
import org.apache.spark.sql._
|
||||
import org.apache.spark.sql.functions._
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import scala.beans.BeanProperty
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
case class OrcidAutor(
|
||||
@BeanProperty var orcid: String,
|
||||
@BeanProperty var familyName: String,
|
||||
@BeanProperty var givenName: String,
|
||||
@BeanProperty var creditName: String,
|
||||
@BeanProperty var otherNames: java.util.List[String]
|
||||
) {
|
||||
def this() = this("null", "null", "null", "null", null)
|
||||
}
|
||||
|
||||
case class MatchData(
|
||||
@BeanProperty var id: String,
|
||||
@BeanProperty var graph_authors: java.util.List[Author],
|
||||
@BeanProperty var orcid_authors: java.util.List[OrcidAutor]
|
||||
) {
|
||||
def this() = this("null", null, null)
|
||||
}
|
||||
|
||||
case class MatchedAuthors(
|
||||
@BeanProperty var author: Author,
|
||||
@BeanProperty var orcid: OrcidAutor,
|
||||
@BeanProperty var `type`: String
|
||||
)
|
||||
|
||||
class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String], log: Logger)
|
||||
extends AbstractScalaApplication(propertyPath, args, log: Logger) {
|
||||
|
||||
|
@ -87,7 +62,7 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
|
|||
|
||||
}
|
||||
|
||||
private def createTemporaryData(graphPath: String, orcidPath: String, targetPath: String): Unit = {
|
||||
def createTemporaryData(graphPath: String, orcidPath: String, targetPath: String): Unit = {
|
||||
val orcidAuthors =
|
||||
spark.read.load(s"$orcidPath/Authors").select("orcid", "familyName", "givenName", "creditName", "otherNames")
|
||||
|
||||
|
|
Loading…
Reference in New Issue