ORCID Enrichment and Download #364

Merged
claudio.atzori merged 14 commits from orcid_import into beta 2023-12-01 15:05:45 +01:00
2 changed files with 6 additions and 13 deletions
Showing only changes of commit bf0fd27c36 - Show all commits

View File

@ -132,17 +132,6 @@ public class AuthorMerger {
.trim(); .trim();
} }
/**
 * Computes a Hamming-style distance between two strings.
 *
 * @param str1 the first string to compare
 * @param str2 the second string to compare
 * @return the number of positions at which the two strings differ when they have
 *         the same length; otherwise the length of the longer string
 */
static int hammingDist(String str1, String str2) {
// Strings of different lengths are not compared character by character:
// the length of the longer one is returned as the distance.
if (str1.length() != str2.length())
return Math.max(str1.length(), str2.length());
int i = 0, count = 0;
// Same length: count the positions at which the characters differ.
while (i < str1.length()) {
if (str1.charAt(i) != str2.charAt(i))
count++;
i++;
}
return count;
}
private static String authorFieldToBeCompared(Author author) { private static String authorFieldToBeCompared(Author author) {
if (StringUtils.isNotBlank(author.getSurname())) { if (StringUtils.isNotBlank(author.getSurname())) {

Out of context, this seems to assign too large a distance to strings of different lengths.
What about taking the shorter string as the reference for the while loop, pre-filling count with the length difference, and then adding the char-by-char comparison differences?
That would be more permissive for strings that share the very same prefix.

Out of context, this seems to assign too large a distance to strings of different lengths. What about taking the shorter string as the reference for the while loop, pre-filling count with the length difference, and then adding the char-by-char comparison differences? That would be more permissive for strings that share the very same prefix.
Review

The hammingDist function is never used; it was a test for a previous comparison function, so I deleted it.

The hammingDist function is never used; it was a test for a previous comparison function, so I deleted it.

View File

@ -2,10 +2,12 @@ package eu.dnetlib.dhp.enrich.orcid
import eu.dnetlib.dhp.application.AbstractScalaApplication import eu.dnetlib.dhp.application.AbstractScalaApplication
import eu.dnetlib.dhp.oa.merge.AuthorMerger import eu.dnetlib.dhp.oa.merge.AuthorMerger
import eu.dnetlib.dhp.schema.common.ModelSupport
import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Result, Software} import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Result, Software}
import org.apache.spark.sql.functions._ import org.apache.spark.sql.functions._
import org.apache.spark.sql._ import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String], log: Logger) class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String], log: Logger)
extends AbstractScalaApplication(propertyPath, args, log: Logger) { extends AbstractScalaApplication(propertyPath, args, log: Logger) {
@ -21,6 +23,8 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
val targetPath = parser.get("targetPath") val targetPath = parser.get("targetPath")
log.info(s"targetPath is '$targetPath'") log.info(s"targetPath is '$targetPath'")
val orcidPublication: Dataset[Row] = generateOrcidTable(spark, orcidPath) val orcidPublication: Dataset[Row] = generateOrcidTable(spark, orcidPath)
// ModelSupport.entityTypes.entrySet().asScala.filter(k => k.getKey.getClass isInstance(Result))
enrichResult( enrichResult(
spark, spark,
s"$graphPath/publication", s"$graphPath/publication",
@ -63,7 +67,7 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
.schema(enc.schema) .schema(enc.schema)
.json(graphPath) .json(graphPath)
.select(col("id"), col("datainfo"), col("instance")) .select(col("id"), col("datainfo"), col("instance"))
.where("datainfo.deletedbyinference = false") .where("datainfo.deletedbyinference != true")
.drop("datainfo") .drop("datainfo")
.withColumn("instances", explode(col("instance"))) .withColumn("instances", explode(col("instance")))
.withColumn("pids", explode(col("instances.pid"))) .withColumn("pids", explode(col("instances.pid")))
@ -109,7 +113,7 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
.load(s"$inputPath/Works") .load(s"$inputPath/Works")
.select(col("orcid"), explode(col("pids")).alias("identifier")) .select(col("orcid"), explode(col("pids")).alias("identifier"))
.where( .where(
"identifier.schema = 'doi' or identifier.schema ='pmid' or identifier.schema ='pmc' or identifier.schema ='arxiv' or identifier.schema ='handle'" "identifier.schema IN('doi','pmid','pmc','arxiv','handle')"
) )
val orcidPublication = orcidAuthors val orcidPublication = orcidAuthors
.join(orcidWorks, orcidAuthors("orcid").equalTo(orcidWorks("orcid"))) .join(orcidWorks, orcidAuthors("orcid").equalTo(orcidWorks("orcid")))