1
0
Fork 0

Implemented ORCID Enrichment

This commit is contained in:
Sandro La Bruzzo 2023-11-24 12:39:58 +01:00
parent 6ce36b3e41
commit 34a4b3cbdf
9 changed files with 696 additions and 40 deletions

View File

@ -1,11 +1,18 @@
package eu.dnetlib.dhp.oa.merge; package eu.dnetlib.dhp.oa.merge;
import java.io.FileWriter;
import java.io.IOException;
import java.text.Normalizer; import java.text.Normalizer;
import java.util.*; import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.MutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.jetbrains.annotations.NotNull;
import com.wcohen.ss.JaroWinkler; import com.wcohen.ss.JaroWinkler;
@ -14,6 +21,28 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.pace.model.Person; import eu.dnetlib.pace.model.Person;
import scala.Tuple2; import scala.Tuple2;
/**
 * Mutable record of a single cell of an author/ORCID similarity matrix: the position of the
 * author, the position of the ORCID author and the similarity score stored for that pair.
 * Instances are ordered by ascending similarity score.
 */
class SimilarityCellInfo implements Comparable<SimilarityCellInfo> {

    /** Index of the author in the list being enriched. */
    public int authorPosition = 0;

    /** Index of the author in the ORCID list. */
    public int orcidPosition = 0;

    /** Similarity score recorded for the (authorPosition, orcidPosition) pair. */
    public double maxColumnSimilarity = 0.0;

    public SimilarityCellInfo() {
    }

    /**
     * Overwrites the three fields in one call.
     *
     * @param authPos    index of the author in the list being enriched
     * @param orcidPos   index of the author in the ORCID list
     * @param similarity similarity score of the pair
     */
    public void setValues(final int authPos, final int orcidPos, final double similarity) {
        authorPosition = authPos;
        orcidPosition = orcidPos;
        maxColumnSimilarity = similarity;
    }

    /** Compares two cells by their similarity score only (ascending). */
    @Override
    public int compareTo(@NotNull SimilarityCellInfo o) {
        final double mine = this.maxColumnSimilarity;
        final double theirs = o.maxColumnSimilarity;
        return Double.compare(mine, theirs);
    }
}
public class AuthorMerger { public class AuthorMerger {
private static final Double THRESHOLD = 0.95; private static final Double THRESHOLD = 0.95;
@ -119,6 +148,267 @@ public class AuthorMerger {
}); });
} }
/**
 * Produces a lower-cased, whitespace-separated version of a full name.
 * The input is first passed through {@code nfd} (defined elsewhere in this class; presumably a
 * Unicode NFD decomposition - TODO confirm), then every run of non-word characters, combining
 * diacritical marks, punctuation, digits and newlines is replaced by a single space, and the
 * result is trimmed. Token order is preserved.
 *
 * @param fullname the full name to normalize
 * @return the normalized full name
 */
public static String normalizeFullName(final String fullname) {
    // do not compact the regexes in a single expression, would cause StackOverflowError
    // in case of large input strings
    final String[] patterns = {
        "(\\W)+",
        "(\\p{InCombiningDiacriticalMarks})+",
        "(\\p{Punct})+",
        "(\\d)+",
        "(\\n)+"
    };
    String cleaned = nfd(fullname).toLowerCase();
    for (final String pattern : patterns) {
        cleaned = cleaned.replaceAll(pattern, " ");
    }
    // return Arrays.stream(fullname.split("[\\s | , | ;]+")).map(String::toLowerCase).sorted().collect(Collectors.joining());
    return cleaned.trim();
}
/**
 * Builds a grouping key for an author from its surname.
 *
 * @param a the author
 * @return the result of {@code normalize} applied to the surname, or the literal
 *         {@code "NOSURNAME"} when the surname is null
 */
private static String generateAuthorkey(final Author a) {
    final String surname = a.getSurname();
    return surname == null ? "NOSURNAME" : normalize(surname);
}
//
// public static List<Author> enrichOrcid2(List<Author> baseAuthor, List<Author> orcidAuthor) {
// if (baseAuthor == null || baseAuthor.isEmpty())
// return orcidAuthor;
//
// if (orcidAuthor == null || orcidAuthor.isEmpty())
// return baseAuthor;
//
// if (baseAuthor.size() == 1 && orcidAuthor.size() > 10)
// return baseAuthor;
//
//
// Map<String, List<Author>> pubClusters = baseAuthor.stream().collect(Collectors.toMap(AuthorMerger::generateAuthorkey, Arrays::asList, (a, b) -> {
// a.addAll(b);
// return a;
// }));
//
// Map<String, List<Author>> orcidClusters = baseAuthor.stream().collect(Collectors.toMap(AuthorMerger::generateAuthorkey, Arrays::asList, (a, b) -> {
// a.addAll(b);
// return a;
// }));
//
// System.out.println(pubClusters.keySet().size());
// System.out.println(orcidClusters.keySet().size());
//
//
//
//
// return null;
//
//
// }
/**
 * Counts the positions at which two strings differ.
 * Strings of different length are not aligned: in that case the length of the longer string is
 * returned as the distance.
 *
 * @param str1 first string
 * @param str2 second string
 * @return number of differing positions, or the longer length when the lengths differ
 */
static int hammingDist(String str1, String str2) {
    final int length = str1.length();
    if (length != str2.length()) {
        return Math.max(length, str2.length());
    }
    int mismatches = 0;
    for (int pos = 0; pos < length; pos++) {
        if (str1.charAt(pos) != str2.charAt(pos)) {
            mismatches++;
        }
    }
    return mismatches;
}
/**
 * Picks the field of an author to use for a textual comparison: the surname when it is not
 * blank, otherwise the fullname when it is not blank.
 *
 * @param author the author
 * @return the surname, else the fullname, else {@code null} when both are blank
 */
private static String authorFieldToBeCompared(Author author) {
    final String surname = author.getSurname();
    if (!StringUtils.isBlank(surname)) {
        return surname;
    }
    final String fullname = author.getFullname();
    return StringUtils.isNotBlank(fullname) ? fullname : null;
}
/**
 * Decides whether two authors can be considered the same person.
 * When both authors carry a non-blank surname and a non-blank name, the decision is taken
 * directly on those fields: surnames must be equal ignoring case and the names must start with
 * the same character ignoring case. In every other case the decision is the one taken by
 * {@link #checkSimilarity2(Author, Author)} on the parsed names.
 *
 * @param left  first author
 * @param right second author
 * @return true when the two authors are considered the same
 */
public static boolean checkSimilarity3(final Author left, final Author right) {
    final boolean bothSplitIntoNameAndSurname = StringUtils.isNotBlank(left.getSurname())
        && StringUtils.isNotBlank(left.getName())
        && StringUtils.isNotBlank(right.getSurname())
        && StringUtils.isNotBlank(right.getName());

    if (bothSplitIntoNameAndSurname) {
        if (!left.getSurname().equalsIgnoreCase(right.getSurname())) {
            return false;
        }
        // names are not blank here, so they have at least one character
        final String leftInitial = left.getName().substring(0, 1);
        final String rightInitial = right.getName().substring(0, 1);
        return leftInitial.equalsIgnoreCase(rightInitial);
    }

    // Fallback: same surname-in-common / name-initial rules applied on the parsed Person
    return checkSimilarity2(left, right);
}
/**
 * Decides whether two authors can be considered the same person, working on the
 * {@code Person} obtained by {@code parse} for each of them.
 * <ul>
 * <li>if either author has no non-blank surname the result is false;</li>
 * <li>the authors must share at least one surname (case-insensitive);</li>
 * <li>if one of them has no non-blank name, the shared surname is enough;</li>
 * <li>otherwise at least one pair of names must start with the same character
 * (case-insensitive).</li>
 * </ul>
 *
 * @param left  first author
 * @param right second author
 * @return true when the two authors are considered the same
 */
public static boolean checkSimilarity2(final Author left, final Author right) {
    final Person pl = parse(left);
    final Person pr = parse(right);

    // If one of them didn't have a surname the match is false
    if (!(pl.getSurname() != null && pl.getSurname().stream().anyMatch(StringUtils::isNotBlank) &&
        pr.getSurname() != null && pr.getSurname().stream().anyMatch(StringUtils::isNotBlank)))
        return false;

    // The Authors have one surname in common
    if (pl.getSurname().stream().anyMatch(sl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(sl)))) {

        // If one of them has only a surname and is the same we can say that they are the same author
        if ((pl.getName() == null || pl.getName().stream().allMatch(StringUtils::isBlank)) ||
            (pr.getName() == null || pr.getName().stream().allMatch(StringUtils::isBlank)))
            return true;

        // The authors have the same initials of Name in common
        if (pl
            .getName()
            .stream()
            .anyMatch(nl -> pr.getName().stream().anyMatch(nr -> sameInitial(nl, nr))))
            return true;
    }
    return false;
}

/**
 * Tells whether two name tokens start with the same character, ignoring case.
 * Blank (null, empty or whitespace-only) tokens never match: the name lists may contain them
 * next to real names, and taking {@code substring(0, 1)} of an empty token would throw.
 */
private static boolean sameInitial(final String first, final String second) {
    if (StringUtils.isBlank(first) || StringUtils.isBlank(second)) {
        return false;
    }
    return first.substring(0, 1).equalsIgnoreCase(second.substring(0, 1));
}
/**
 * Decides whether two authors can be considered the same person using their raw fields.
 * <ul>
 * <li>an author with neither surname nor fullname never matches;</li>
 * <li>when both surnames are not blank they must be equal ignoring case, or differ by less
 * than two characters according to {@code hammingDist}; then the names must be equal
 * ignoring case or start with the same character, unless one of the names is blank, in which
 * case the surname is enough;</li>
 * <li>otherwise the first non-blank of surname/fullname of each author is compared after
 * {@code normalize}: equality when the raw lengths are the same, containment of the shorter
 * in the longer otherwise.</li>
 * </ul>
 *
 * @param left  first author
 * @param right second author
 * @return true when the two authors are considered the same
 */
public static boolean checkSimilarity(final Author left, final Author right) {
    final boolean leftHasNoIdentity = left.getSurname() == null && left.getFullname() == null;
    if (leftHasNoIdentity) {
        return false;
    }
    final boolean rightHasNoIdentity = right.getSurname() == null && right.getFullname() == null;
    if (rightHasNoIdentity) {
        return false;
    }

    final boolean bothHaveSurname = StringUtils.isNotBlank(left.getSurname())
        && StringUtils.isNotBlank(right.getSurname());

    if (!bothHaveSurname) {
        // At least one author lacks the surname: compare the first non-blank field
        // (surname or fullname) of each of them
        final String l = authorFieldToBeCompared(left);
        final String r = authorFieldToBeCompared(right);
        if (l == null || r == null) {
            return false;
        }
        final String normalizedLeft = normalize(l);
        final String normalizedRight = normalize(r);

        // The same length means they are the same kind of field
        if (l.length() == r.length()) {
            return normalizedLeft.equals(normalizedRight);
        }
        // The shorter one is probably a surname, the longer one a fullname
        return l.length() < r.length()
            ? normalizedRight.contains(normalizedLeft)
            : normalizedLeft.contains(normalizedRight);
    }

    // Same surname, tolerating a single different character (let's say one typo)
    final String leftSurname = left.getSurname();
    final String rightSurname = right.getSurname();
    final boolean surnameMatches = leftSurname.equalsIgnoreCase(rightSurname)
        || hammingDist(leftSurname.toLowerCase(), rightSurname.toLowerCase()) < 2;
    if (!surnameMatches) {
        return false;
    }

    // In case one of the two authors has no given name the surname is enough
    final String leftName = left.getName();
    final String rightName = right.getName();
    if (StringUtils.isBlank(leftName) || StringUtils.isBlank(rightName)) {
        return true;
    }

    // Same name, or names starting with the same letter
    return leftName.equalsIgnoreCase(rightName)
        || leftName.substring(0, 1).equalsIgnoreCase(rightName.substring(0, 1));
}
/**
 * Adds to the authors of a record the pids of the matching ORCID authors.
 * Each base author receives the pids of the first ORCID author accepted by
 * {@link #checkSimilarity2(Author, Author)}; an ORCID author that has been matched is not
 * offered to the following base authors. The base authors are modified in place.
 *
 * @param baseAuthor  the authors to enrich
 * @param orcidAuthor the ORCID authors providing the pids; this list is not modified
 * @return {@code orcidAuthor} when {@code baseAuthor} is null or empty; otherwise
 *         {@code baseAuthor}, left untouched when {@code orcidAuthor} is null or empty, or when
 *         there is a single base author and more than 10 ORCID authors
 */
public static List<Author> enrichOrcid2(List<Author> baseAuthor, List<Author> orcidAuthor) {
    if (baseAuthor == null || baseAuthor.isEmpty())
        return orcidAuthor;

    if (orcidAuthor == null || orcidAuthor.isEmpty())
        return baseAuthor;

    // NOTE(review): presumably a guard against implausible matches when a single author is
    // compared with a long ORCID list - confirm the rationale of the threshold
    if (baseAuthor.size() == 1 && orcidAuthor.size() > 10)
        return baseAuthor;

    // Work on a copy so that matched ORCID authors can be removed without touching the input
    final List<Author> unmatchedOrcidAuthors = new ArrayList<>(orcidAuthor);

    for (final Author ba : baseAuthor) {
        unmatchedOrcidAuthors
            .stream()
            .filter(oa -> checkSimilarity2(ba, oa))
            .findFirst()
            .ifPresent(sameAuthor -> {
                addPid(ba, sameAuthor.getPid());
                unmatchedOrcidAuthors.remove(sameAuthor);
            });
    }
    return baseAuthor;
}
/**
 * Adds to the authors of a record the pids of the most similar ORCID authors.
 * For every ORCID author the base author with the highest {@code sim} score is recorded; the
 * ORCID authors whose best score is greater than 0.85 then give their pids to that base
 * author, in ascending order of score. The base authors are modified in place.
 *
 * @param baseAuthor  the authors to enrich
 * @param orcidAuthor the ORCID authors providing the pids
 * @return {@code orcidAuthor} when {@code baseAuthor} is null or empty; otherwise
 *         {@code baseAuthor}, left untouched when {@code orcidAuthor} is null or empty, or when
 *         there is a single base author and more than 10 ORCID authors
 */
public static List<Author> enrichOrcid(List<Author> baseAuthor, List<Author> orcidAuthor) {
    if (baseAuthor == null || baseAuthor.isEmpty()) {
        return orcidAuthor;
    }
    if (orcidAuthor == null || orcidAuthor.isEmpty()) {
        return baseAuthor;
    }
    if (baseAuthor.size() == 1 && orcidAuthor.size() > 10) {
        return baseAuthor;
    }

    final int baseCount = baseAuthor.size();
    final int orcidCount = orcidAuthor.size();

    // One entry per ORCID author, holding the best scoring base author found so far
    final List<SimilarityCellInfo> bestPerOrcidAuthor = new ArrayList<>();
    for (int column = 0; column < orcidCount; column++) {
        bestPerOrcidAuthor.add(new SimilarityCellInfo());
    }

    for (int row = 0; row < baseCount; row++) {
        for (int column = 0; column < orcidCount; column++) {
            final double score = sim(baseAuthor.get(row), orcidAuthor.get(column));
            final SimilarityCellInfo best = bestPerOrcidAuthor.get(column);
            if (best.maxColumnSimilarity < score) {
                best.setValues(row, column, score);
            }
        }
    }

    // sorted() is stable, so filtering first yields the same sequence as sorting first
    bestPerOrcidAuthor
        .stream()
        .filter(cell -> cell.maxColumnSimilarity > 0.85)
        .sorted()
        .forEach(
            cell -> addPid(
                baseAuthor.get(cell.authorPosition),
                orcidAuthor.get(cell.orcidPosition).getPid()));
    return baseAuthor;
}
/**
 * Appends a list of pids to the pids of an author, creating the author's pid list when it is
 * missing. No de-duplication is performed.
 *
 * @param a    the author receiving the pids; modified in place
 * @param pids the pids to append; a null list is treated as empty
 */
private static void addPid(final Author a, final List<StructuredProperty> pids) {
    if (a.getPid() == null) {
        a.setPid(new ArrayList<>());
    }
    // addAll(null) would throw a NullPointerException: an author without pids adds nothing
    if (pids != null) {
        a.getPid().addAll(pids);
    }
}
public static String pidToComparableString(StructuredProperty pid) { public static String pidToComparableString(StructuredProperty pid) {
final String classid = pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() final String classid = pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase()
: ""; : "";
@ -171,7 +461,7 @@ public class AuthorMerger {
} }
} }
private static String normalize(final String s) { public static String normalize(final String s) {
String[] normalized = nfd(s) String[] normalized = nfd(s)
.toLowerCase() .toLowerCase()
// do not compact the regexes in a single expression, would cause StackOverflowError // do not compact the regexes in a single expression, would cause StackOverflowError

View File

@ -0,0 +1,125 @@
package eu.dnetlib.oa.merge;
import static org.junit.jupiter.api.Assertions.*;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import org.junit.jupiter.api.Test;
import org.junit.platform.commons.util.StringUtils;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.merge.AuthorMerger;
import eu.dnetlib.dhp.schema.oaf.Author;
/**
 * Tests for the name normalization, author similarity and ORCID enrichment helpers of
 * {@link AuthorMerger}.
 */
public class AuthorMergerTest {

    /**
     * normalizeFullName lower-cases the name and replaces separators with a single space,
     * keeping the tokens in their original order.
     */
    @Test
    public void testNormalization() {
        assertEquals("sandro la bruzzo", AuthorMerger.normalizeFullName("Sandro, La Bruzzo"));
        assertEquals("miriam baglioni", AuthorMerger.normalizeFullName("Miriam Baglioni"));
        assertEquals("miriam baglioni", AuthorMerger.normalizeFullName("Miriam ;Baglioni,"));
    }

    /**
     * Manual report of the enrichment on the first records of two classpath resources, read in
     * parallel: for each publication it prints how many authors carry an ORCID pid before and
     * after the enrichment. It asserts nothing and is not annotated with {@code @Test}, so the
     * test runner does not execute it.
     */
    public void testEnrcichAuthor() throws Exception {
        final ObjectMapper mapper = new ObjectMapper();
        final TypeReference<List<Author>> aclass = new TypeReference<List<Author>>() {
        };

        // try-with-resources: both readers are closed even when parsing fails
        try (
            BufferedReader pr = new BufferedReader(new InputStreamReader(
                AuthorMergerTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/merge/authors_publication.json")));
            BufferedReader or = new BufferedReader(new InputStreamReader(
                AuthorMergerTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/merge/authors_orcid.json")))) {

            String pubLine;
            int i = 0;
            while ((pubLine = pr.readLine()) != null) {
                final String pubId = pubLine;
                final String MatchPidOrcid = or.readLine();
                final String pubOrcid = or.readLine();

                final String data = pr.readLine();

                if (StringUtils.isNotBlank(data)) {
                    List<Author> publicationAuthors = mapper.readValue(data, aclass);
                    List<Author> orcidAuthors = mapper.readValue(or.readLine(), aclass);
                    System.out.printf("OAF ID = %s \n", pubId);
                    System.out.printf("ORCID Intersected ID = %s \n", pubOrcid);
                    System.out.printf("OAF Author Size = %d \n", publicationAuthors.size());
                    System.out.printf("Oricd Author Size = %d \n", orcidAuthors.size());
                    System.out.printf("Oricd Matched PID = %s \n", MatchPidOrcid);

                    long originalAuthorWithPiD = countAuthorsWithOrcid(publicationAuthors);
                    long start = System.currentTimeMillis();

                    // final List<Author> enrichedList = AuthorMerger.enrichOrcid(publicationAuthors, orcidAuthors);
                    final List<Author> enrichedList = AuthorMerger.enrichOrcid2(publicationAuthors, orcidAuthors);

                    long enrichedAuthorWithPid = countAuthorsWithOrcid(enrichedList);

                    long totalTime = (System.currentTimeMillis() - start) / 1000;
                    System.out
                        .printf(
                            "Enriched authors in %d seconds from %d pid to %d pid \n", totalTime, originalAuthorWithPiD,
                            enrichedAuthorWithPid);

                    System.out.println("=================");

                    if (++i > 30)
                        break;
                }
            }
        }
    }

    /**
     * Counts the authors having at least one pid whose qualifier classid contains "orcid"
     * (case-insensitive). Pids without qualifier or without classid are ignored.
     */
    private static long countAuthorsWithOrcid(final List<Author> authors) {
        return authors
            .stream()
            .filter(
                a -> a.getPid() != null && a
                    .getPid()
                    .stream()
                    .anyMatch(
                        p -> p.getQualifier() != null
                            && p.getQualifier().getClassid() != null
                            && p.getQualifier().getClassid().toLowerCase().contains("orcid")))
            .count();
    }

    /**
     * Two authors sharing the surname but with given names starting with different letters must
     * not be considered the same person.
     */
    @Test
    public void checkSimilarityTest() {
        final Author left = new Author();
        left.setSurname("Wu");
        left.setName("M.");
        left.setFullname("Wu, M.");

        System.out.println(AuthorMerger.normalizeFullName(left.getFullname()));

        final Author right = new Author();
        right.setName("Xin");
        right.setSurname("Wu");
        right.setFullname("Xin Wu");
        // System.out.println(AuthorMerger.normalize(right.getFullname()));
        boolean same = AuthorMerger.checkSimilarity2(left, right);

        assertFalse(same);
    }
}

View File

@ -32,45 +32,6 @@ import eu.dnetlib.dhp.parser.utility.VtdException;
public class DownloadORCIDTest { public class DownloadORCIDTest {
private final Logger log = LoggerFactory.getLogger(DownloadORCIDTest.class); private final Logger log = LoggerFactory.getLogger(DownloadORCIDTest.class);
// public void test() throws Exception {
//
// Configuration conf = new Configuration();
// // Set FileSystem URI
//// conf.set("fs.defaultFS", "file://");
// // Because of Maven
// conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
// conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
//
// System.setProperty("hadoop.home.dir", "file:///Users/sandro/orcid/");
//
// final FileSystem fileSystem = FileSystem.get(conf);
//
// new ExtractORCIDDump(fileSystem).run("/Users/sandro/orcid/", "/Users/sandro/orcid/extracted");
//
//// final GZIPInputStream gzip = new GZIPInputStream(Files.newInputStream(Paths.get("/Users/sandro/orcid/ORCID_2023_10_activities_1.tar.gz")));
//// try(final TarArchiveInputStream tais = new TarArchiveInputStream(gzip)) {
////
//// TarArchiveEntry entry;
//// while ((entry = tais.getNextTarEntry()) != null) {
////
//// if (entry.isFile() && entry.getName().contains("employments")) {
////
//// System.out.println(entry.getName());
//// final String [] items = entry.getName().split("/");
////
//// final String res = IOUtils.toString(new BufferedReader(new InputStreamReader(tais)));
//// System.out.println("res = " + res);
////
//// System.out.println(items[items.length-2]);
//// break;
//// }
////
////
//// }
//// }
//
// }
@Test @Test
public void testSummary() throws Exception { public void testSummary() throws Exception {
final String xml = IOUtils final String xml = IOUtils

View File

@ -0,0 +1,26 @@
[
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "op",
"paramLongName": "orcidPath",
"paramDescription": "the path of the orcid Table generated by the dump",
"paramRequired": true
},
{
"paramName": "gp",
"paramLongName": "graphPath",
"paramDescription": "the path of the graph we want to apply enrichment",
"paramRequired": true
},
{
"paramName": "tp",
"paramLongName": "targetPath",
"paramDescription": "the output path of the graph enriched",
"paramRequired": true
}
]

View File

@ -0,0 +1,34 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>hiveMetastoreUris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hiveJdbcUrl</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
</property>
<property>
<name>hiveDbName</name>
<value>openaire</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

View File

@ -0,0 +1,52 @@
<workflow-app name="Enrich_graph_with_ORCID_Workflow" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>orcidPath</name>
<description>the path of the orcid Table generated by the dump</description>
</property>
<property>
<name>graphPath</name>
<description>the path of the graph we want to apply enrichment</description>
</property>
<property>
<name>targetPath</name>
<description>the output path of the graph enriched</description>
</property>
</parameters>
<start to="EnrichGraph"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="EnrichGraph">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Enrich Graph with ORCID</name>
<class>eu.dnetlib.dhp.enrich.orcid.SparkEnrichGraphWithOrcidAuthors</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=2g
--conf spark.sql.shuffle.partitions=3000
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--orcidPath</arg><arg>${orcidPath}</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
<arg>--graphPath</arg><arg>${graphPath}/publication</arg>
<arg>--master</arg><arg>yarn</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,37 @@
package eu.dnetlib.dhp.enrich.orcid
import eu.dnetlib.dhp.schema.oaf.{Author, Publication}
import eu.dnetlib.dhp.schema.sx.OafUtils
import org.apache.spark.sql.Row
import scala.collection.JavaConverters._
/** Helpers turning ORCID author rows into OAF [[Author]] instances. */
object AuthorEnricher extends Serializable {

  /** Builds an OAF author carrying a single ORCID pid.
    *
    * @param givenName  the given name, may be null
    * @param familyName the family name, may be null
    * @param orcid      the ORCID identifier, stored as the only pid of the author
    * @return the new author; its full name is "givenName familyName" built from the parts
    *         that are neither null nor blank (empty when both are missing)
    */
  def createAuthor(givenName: String, familyName: String, orcid: String): Author = {
    val a = new Author
    a.setName(givenName)
    a.setSurname(familyName)
    // Interpolating a null part would write the literal text "null" into the full name,
    // so only the parts that are actually present are joined.
    val fullname = Seq(givenName, familyName)
      .filter(part => part != null && part.trim.nonEmpty)
      .mkString(" ")
    a.setFullname(fullname)
    a.setPid(List(OafUtils.createSP(orcid, "ORCID", "ORCID")).asJava)
    a
  }

  /** Converts the list of author structs found at position 1 of the row into OAF authors.
    *
    * Each struct is read through its `givenName`, `familyName` and `orcid` fields.
    *
    * @param r a row whose second column is a list of author structs
    * @return one OAF author per struct, in the same order
    */
  def toOAFAuthor(r: Row): java.util.List[Author] = {
    r.getList[Row](1)
      .asScala
      .map(s => createAuthor(s.getAs[String]("givenName"), s.getAs[String]("familyName"), s.getAs[String]("orcid")))
      .toList
      .asJava
  }

//  def enrichAuthor(p:Publication,r:Row): Unit = {
//    val k:Map[String, OAuthor] =r.getList[Row](1).asScala.map(s => (s.getAs[String]("orcid"), OAuthor(s.getAs[String]("givenName") ,s.getAs[String]("familyName") ))).groupBy(_._1).mapValues(_.map(_._2).head)
//    println(k)
//
//
//
//  }

}

View File

@ -0,0 +1,119 @@
package eu.dnetlib.dhp.enrich.orcid
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.AbstractScalaApplication
import eu.dnetlib.dhp.oa.merge.AuthorMerger
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Instance, Publication, StructuredProperty}
import org.apache.spark.sql.{Dataset, Encoder, Encoders, Row, SaveMode, SparkSession}
import org.apache.spark.sql.functions.{col, collect_set, concat, explode, expr, first, flatten, lower, size, struct}
import org.slf4j.{Logger, LoggerFactory}
import org.apache.spark.sql.types._
/** Spark application that adds ORCID pids to the authors of the publications of a graph.
  *
  * The publications are linked to the ORCID works through their instance pids; the authors of
  * the linked works are then merged into the publication authors by
  * `AuthorMerger.enrichOrcid2`.
  *
  * @param propertyPath classpath location of the JSON file describing the accepted arguments
  * @param args         the command line arguments
  * @param log          the logger used by the application
  */
class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String], log: Logger)
    extends AbstractScalaApplication(propertyPath, args, log: Logger) {

  /** Here all the spark applications runs this method
    * where the whole logic of the spark node is defined
    */
  override def run(): Unit = {
    val graphPath = parser.get("graphPath")
    log.info(s"graphPath is '$graphPath'")
    val orcidPath = parser.get("orcidPath")
    log.info(s"orcidPath is '$orcidPath'")
    val targetPath = parser.get("targetPath")
    log.info(s"targetPath is '$targetPath'")
    enrichResult(spark, graphPath, orcidPath, targetPath)
  }

  /** Enriches the publications found in `graphPath` and writes them as gzip-compressed JSON.
    *
    * @param spark      the Spark session
    * @param graphPath  path of the JSON publications to enrich
    * @param orcidPath  path of the ORCID tables (see [[generateOrcidTable]])
    * @param outputPath path where the enriched publications are written, overwriting any
    *                   existing content
    */
  def enrichResult(spark: SparkSession, graphPath: String, orcidPath: String, outputPath: String): Unit = {
    val orcidPublication = generateOrcidTable(spark, orcidPath)

    implicit val publicationEncoder: Encoder[Publication] = Encoders.bean(classOf[Publication])

    // Minimal schema: only the fields needed to extract the instance pids are parsed
    val schema = new StructType()
      .add("id", StringType)
      .add("dataInfo", Encoders.bean(classOf[DataInfo]).schema)
      .add(
        "instance",
        ArrayType(new StructType().add("pid", ArrayType(Encoders.bean(classOf[StructuredProperty]).schema)))
      )

    // One row per (pid schema, pid value, publication id) for the records not deleted by inference
    val entities = spark.read
      .schema(schema)
      .json(graphPath)
      .where("datainfo.deletedbyinference = false")
      .drop("datainfo")
      .withColumn("instances", explode(col("instance")))
      .withColumn("pids", explode(col("instances.pid")))
      .select(
        col("pids.qualifier.classid").alias("pid_schema"),
        col("pids.value").alias("pid_value"),
        col("id").alias("dnet_id")
      )

    // For every publication id, the distinct ORCID authors of the works sharing one of its pids
    // (schema and value are compared case-insensitively)
    val orcidDnet = orcidPublication
      .join(
        entities,
        lower(col("schema")).equalTo(lower(col("pid_schema"))) &&
        lower(col("value")).equalTo(lower(col("pid_value"))),
        "inner"
      )
      .groupBy(col("dnet_id"))
      .agg(collect_set(orcidPublication("author")).alias("orcid_authors"))
      .select("dnet_id", "orcid_authors")
      .cache()

    val publication = spark.read.schema(publicationEncoder.schema).json(graphPath).as[Publication]

    // Left join: publications without ORCID authors are written unchanged
    publication
      .joinWith(orcidDnet, publication("id").equalTo(orcidDnet("dnet_id")), "left")
      .map {
        case (p: Publication, null) => {
          p
        }
        case (p: Publication, r: Row) =>
          p.setAuthor(AuthorMerger.enrichOrcid2(p.getAuthor, AuthorEnricher.toOAFAuthor(r)))
          p
      }
      .write
      .mode(SaveMode.Overwrite)
      .option("compression", "gzip")
      .json(outputPath)
  }

  /** Builds the table linking the pids of the ORCID works to their authors.
    *
    * Reads `Authors` and `Works` under `inputPath`, keeps the work identifiers whose schema is
    * one of doi, pmid, pmc, arxiv or handle, and joins the two tables on the ORCID identifier.
    *
    * @param spark     the Spark session
    * @param inputPath base path containing the `Authors` and `Works` tables
    * @return a dataset with the columns `schema`, `value` and `author`
    *         (a struct of `orcid`, `givenName` and `familyName`)
    */
  def generateOrcidTable(spark: SparkSession, inputPath: String): Dataset[Row] = {
    val orcidAuthors =
      spark.read.load(s"$inputPath/Authors").select("orcid", "familyName", "givenName", "creditName", "otherNames")
    val orcidWorks = spark.read
      .load(s"$inputPath/Works")
      .select(col("orcid"), explode(col("pids")).alias("identifier"))
      .where(
        "identifier.schema = 'doi' or identifier.schema ='pmid' or identifier.schema ='pmc' or identifier.schema ='arxiv' or identifier.schema ='handle'"
      )
    orcidAuthors
      .join(orcidWorks, orcidAuthors("orcid").equalTo(orcidWorks("orcid")))
      .select(
        col("identifier.schema").alias("schema"),
        col("identifier.value").alias("value"),
        struct(orcidAuthors("orcid").alias("orcid"), col("givenName"), col("familyName")).alias("author")
      )
  }

}
/** Entry point of the ORCID enrichment application. */
object SparkEnrichGraphWithOrcidAuthors {

  val log: Logger = LoggerFactory.getLogger(SparkEnrichGraphWithOrcidAuthors.getClass)

  /** Creates the application with the bundled parameter description, initializes it and runs it.
    *
    * @param args the command line arguments
    */
  def main(args: Array[String]): Unit = {
    val parametersResource = "/eu/dnetlib/dhp/enrich/orcid/enrich_graph_orcid_parameters.json"
    val application = new SparkEnrichGraphWithOrcidAuthors(parametersResource, args, log)
    application.initialize().run()
  }

}

View File

@ -0,0 +1,12 @@
package eu.dnetlib.dhp.enrich.orcid
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.junit.jupiter.api.Test
import org.slf4j.{Logger, LoggerFactory}
/** Test class for the ORCID enrichment; it currently declares only a logger and no tests. */
class EnrichOrcidTest {

  /** Logger named after the runtime class of the instance. */
  val log: Logger = LoggerFactory.getLogger(this.getClass)

}