Move AuthorMatchers in dhp-common
This commit is contained in:
parent
dccbcfd36c
commit
36ca0b123e
|
@ -1,4 +1,4 @@
|
||||||
package eu.dnetlib.pace.util
|
package eu.dnetlib.dhp.utils
|
||||||
|
|
||||||
import java.util.Locale
|
import java.util.Locale
|
||||||
import java.util.regex.Pattern
|
import java.util.regex.Pattern
|
||||||
|
@ -66,19 +66,18 @@ object AuthorMatchers {
|
||||||
}
|
}
|
||||||
|
|
||||||
def removeMatches(
|
def removeMatches(
|
||||||
graph_authors: java.util.List[String],
|
graph_authors: java.util.List[String],
|
||||||
orcid_authors: java.util.List[String],
|
orcid_authors: java.util.List[String],
|
||||||
matchingFunc: java.util.function.BiFunction[String,String,Boolean]
|
matchingFunc: java.util.function.BiFunction[String, String, Boolean]
|
||||||
) : java.util.List[String] = {
|
): java.util.List[String] = {
|
||||||
removeMatches(graph_authors, orcid_authors, (a, b) => matchingFunc(a,b))
|
removeMatches(graph_authors, orcid_authors, (a, b) => matchingFunc(a, b))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def removeMatches(
|
def removeMatches(
|
||||||
graph_authors: java.util.List[String],
|
graph_authors: java.util.List[String],
|
||||||
orcid_authors: java.util.List[String],
|
orcid_authors: java.util.List[String],
|
||||||
matchingFunc: (String, String) => Boolean
|
matchingFunc: (String, String) => Boolean
|
||||||
) : java.util.List[String] = {
|
): java.util.List[String] = {
|
||||||
val matched = new java.util.ArrayList[String]()
|
val matched = new java.util.ArrayList[String]()
|
||||||
|
|
||||||
if (graph_authors != null && !graph_authors.isEmpty) {
|
if (graph_authors != null && !graph_authors.isEmpty) {
|
|
@ -1,21 +1,44 @@
|
||||||
package eu.dnetlib.dhp.enrich.orcid
|
package eu.dnetlib.dhp.utils
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Author, StructuredProperty}
|
import eu.dnetlib.dhp.schema.oaf.{Author, StructuredProperty}
|
||||||
import eu.dnetlib.dhp.schema.sx.OafUtils
|
import eu.dnetlib.dhp.schema.sx.OafUtils
|
||||||
import eu.dnetlib.pace.util.AuthorMatchers
|
|
||||||
|
|
||||||
import java.util
|
import java.util
|
||||||
import scala.beans.BeanProperty
|
import scala.beans.BeanProperty
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
import scala.util.control.Breaks.{break, breakable}
|
import scala.util.control.Breaks.{break, breakable}
|
||||||
|
|
||||||
|
case class OrcidAuthor(
|
||||||
|
@BeanProperty var orcid: String,
|
||||||
|
@BeanProperty var familyName: String,
|
||||||
|
@BeanProperty var givenName: String,
|
||||||
|
@BeanProperty var creditName: String,
|
||||||
|
@BeanProperty var otherNames: java.util.List[String]
|
||||||
|
) {
|
||||||
|
def this() = this("null", "null", "null", "null", null)
|
||||||
|
}
|
||||||
|
|
||||||
|
case class MatchedAuthors(
|
||||||
|
@BeanProperty var author: Author,
|
||||||
|
@BeanProperty var orcid: OrcidAuthor,
|
||||||
|
@BeanProperty var `type`: String
|
||||||
|
)
|
||||||
|
|
||||||
|
case class MatchData(
|
||||||
|
@BeanProperty var id: String,
|
||||||
|
@BeanProperty var graph_authors: java.util.List[Author],
|
||||||
|
@BeanProperty var orcid_authors: java.util.List[OrcidAuthor]
|
||||||
|
) {
|
||||||
|
def this() = this("null", null, null)
|
||||||
|
}
|
||||||
|
|
||||||
case class ORCIDAuthorEnricherResult(
|
case class ORCIDAuthorEnricherResult(
|
||||||
@BeanProperty var id: String,
|
@BeanProperty var id: String,
|
||||||
@BeanProperty var enriched_author: java.util.List[Author],
|
@BeanProperty var enriched_author: java.util.List[Author],
|
||||||
@BeanProperty var author_matched: java.util.List[MatchedAuthors],
|
@BeanProperty var author_matched: java.util.List[MatchedAuthors],
|
||||||
@BeanProperty var author_unmatched: java.util.List[Author],
|
@BeanProperty var author_unmatched: java.util.List[Author],
|
||||||
@BeanProperty var orcid_unmatched: java.util.List[OrcidAutor]
|
@BeanProperty var orcid_unmatched: java.util.List[OrcidAuthor]
|
||||||
)
|
)
|
||||||
|
|
||||||
object ORCIDAuthorEnricher extends Serializable {
|
object ORCIDAuthorEnricher extends Serializable {
|
||||||
|
@ -23,7 +46,7 @@ object ORCIDAuthorEnricher extends Serializable {
|
||||||
def enrichOrcid(
|
def enrichOrcid(
|
||||||
id: String,
|
id: String,
|
||||||
graph_authors: java.util.List[Author],
|
graph_authors: java.util.List[Author],
|
||||||
orcid_authors: java.util.List[OrcidAutor]
|
orcid_authors: java.util.List[OrcidAuthor]
|
||||||
): ORCIDAuthorEnricherResult = {
|
): ORCIDAuthorEnricherResult = {
|
||||||
// Author enriching strategy:
|
// Author enriching strategy:
|
||||||
// 1) create a copy of graph author list in unmatched_authors
|
// 1) create a copy of graph author list in unmatched_authors
|
||||||
|
@ -81,10 +104,10 @@ object ORCIDAuthorEnricher extends Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
private def extractAndEnrichMatches(
|
private def extractAndEnrichMatches(
|
||||||
graph_authors: java.util.List[Author],
|
graph_authors: java.util.List[Author],
|
||||||
orcid_authors: java.util.List[OrcidAutor],
|
orcid_authors: java.util.List[OrcidAuthor],
|
||||||
matchingFunc: (Author, OrcidAutor) => Boolean,
|
matchingFunc: (Author, OrcidAuthor) => Boolean,
|
||||||
matchName: String
|
matchName: String
|
||||||
) = {
|
) = {
|
||||||
val matched = scala.collection.mutable.ArrayBuffer.empty[MatchedAuthors]
|
val matched = scala.collection.mutable.ArrayBuffer.empty[MatchedAuthors]
|
||||||
|
|
|
@ -1,10 +1,10 @@
|
||||||
package eu.dnetlib.dhp.enrich.orcid
|
package eu.dnetlib.dhp.utils
|
||||||
|
|
||||||
import eu.dnetlib.pace.util.AuthorMatchers.matchOrderedTokenAndAbbreviations
|
import eu.dnetlib.dhp.utils.AuthorMatchers.matchOrderedTokenAndAbbreviations
|
||||||
import org.junit.jupiter.api.Assertions.{assertFalse, assertTrue}
|
import org.junit.jupiter.api.Assertions.{assertFalse, assertTrue}
|
||||||
import org.junit.jupiter.api.Test
|
import org.junit.jupiter.api.Test
|
||||||
|
|
||||||
class ORCIDAuthorMatchersTest {
|
class AuthorMatchersTest {
|
||||||
|
|
||||||
@Test def testShortNames(): Unit = {
|
@Test def testShortNames(): Unit = {
|
||||||
assertTrue(matchOrderedTokenAndAbbreviations("Lasagni Mariozzi Federico", "Lasagni F. Mariozzi"))
|
assertTrue(matchOrderedTokenAndAbbreviations("Lasagni Mariozzi Federico", "Lasagni F. Mariozzi"))
|
|
@ -9,11 +9,11 @@ import java.util.stream.Collectors;
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.utils.AuthorMatchers;
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Person;
|
import eu.dnetlib.pace.model.Person;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractListComparator;
|
import eu.dnetlib.pace.tree.support.AbstractListComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
import eu.dnetlib.pace.util.AuthorMatchers;
|
|
||||||
|
|
||||||
@ComparatorClass("authorsMatch")
|
@ComparatorClass("authorsMatch")
|
||||||
public class AuthorsMatch extends AbstractListComparator {
|
public class AuthorsMatch extends AbstractListComparator {
|
||||||
|
|
|
@ -2,38 +2,13 @@ package eu.dnetlib.dhp.enrich.orcid
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.AbstractScalaApplication
|
import eu.dnetlib.dhp.application.AbstractScalaApplication
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport
|
import eu.dnetlib.dhp.schema.common.ModelSupport
|
||||||
import eu.dnetlib.dhp.schema.oaf._
|
import eu.dnetlib.dhp.utils.{MatchData, ORCIDAuthorEnricher, ORCIDAuthorEnricherResult}
|
||||||
import org.apache.spark.sql._
|
import org.apache.spark.sql._
|
||||||
import org.apache.spark.sql.functions._
|
import org.apache.spark.sql.functions._
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
import scala.beans.BeanProperty
|
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
|
|
||||||
case class OrcidAutor(
|
|
||||||
@BeanProperty var orcid: String,
|
|
||||||
@BeanProperty var familyName: String,
|
|
||||||
@BeanProperty var givenName: String,
|
|
||||||
@BeanProperty var creditName: String,
|
|
||||||
@BeanProperty var otherNames: java.util.List[String]
|
|
||||||
) {
|
|
||||||
def this() = this("null", "null", "null", "null", null)
|
|
||||||
}
|
|
||||||
|
|
||||||
case class MatchData(
|
|
||||||
@BeanProperty var id: String,
|
|
||||||
@BeanProperty var graph_authors: java.util.List[Author],
|
|
||||||
@BeanProperty var orcid_authors: java.util.List[OrcidAutor]
|
|
||||||
) {
|
|
||||||
def this() = this("null", null, null)
|
|
||||||
}
|
|
||||||
|
|
||||||
case class MatchedAuthors(
|
|
||||||
@BeanProperty var author: Author,
|
|
||||||
@BeanProperty var orcid: OrcidAutor,
|
|
||||||
@BeanProperty var `type`: String
|
|
||||||
)
|
|
||||||
|
|
||||||
class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String], log: Logger)
|
class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String], log: Logger)
|
||||||
extends AbstractScalaApplication(propertyPath, args, log: Logger) {
|
extends AbstractScalaApplication(propertyPath, args, log: Logger) {
|
||||||
|
|
||||||
|
@ -87,7 +62,7 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private def createTemporaryData(graphPath: String, orcidPath: String, targetPath: String): Unit = {
|
def createTemporaryData(graphPath: String, orcidPath: String, targetPath: String): Unit = {
|
||||||
val orcidAuthors =
|
val orcidAuthors =
|
||||||
spark.read.load(s"$orcidPath/Authors").select("orcid", "familyName", "givenName", "creditName", "otherNames")
|
spark.read.load(s"$orcidPath/Authors").select("orcid", "familyName", "givenName", "creditName", "otherNames")
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue