Implemented ORCID Enrichment

2023-11-24 12:39:58 +01:00 · 2023-11-24 12:39:58 +01:00 · 34a4b3cbdf
parent 6ce36b3e41
commit 34a4b3cbdf
9 changed files with 696 additions and 40 deletions
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
@ -1,11 +1,18 @@
 package eu.dnetlib.dhp.oa.merge;
 import java.io.FileWriter;
 import java.io.IOException;
 import java.text.Normalizer;
 import java.util.*;
 import java.util.function.Function;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.commons.lang3.tuple.MutablePair;
 import org.apache.commons.lang3.tuple.Pair;
 import org.jetbrains.annotations.NotNull;
 import com.wcohen.ss.JaroWinkler;
@ -14,6 +21,28 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
 import eu.dnetlib.pace.model.Person;
 import scala.Tuple2;
 /**
  * Mutable record of the best match found so far for one ORCID author: which base author it was
  * ({@code authorPosition}), which ORCID author it refers to ({@code orcidPosition}) and the
  * similarity score of that pair ({@code maxColumnSimilarity}).
  * Instances order themselves by ascending similarity score.
  */
 class SimilarityCellInfo implements Comparable<SimilarityCellInfo> {
 	/** Index of the best matching author in the base author list. */
 	public int authorPosition;
 	/** Index of the ORCID author this record belongs to. */
 	public int orcidPosition;
 	/** Highest similarity recorded so far; starts at 0.0. */
 	public double maxColumnSimilarity;
 	public SimilarityCellInfo() {
 		this.authorPosition = 0;
 		this.orcidPosition = 0;
 		this.maxColumnSimilarity = 0.0;
 	}
 	/**
 	 * Overwrites all three fields at once.
 	 *
 	 * @param authPos index of the base author
 	 * @param orcidPos index of the ORCID author
 	 * @param similarity similarity score of the pair
 	 */
 	public void setValues(final int authPos, final int orcidPos, final double similarity) {
 		this.maxColumnSimilarity = similarity;
 		this.orcidPosition = orcidPos;
 		this.authorPosition = authPos;
 	}
 	@Override
 	public int compareTo(@NotNull SimilarityCellInfo o) {
 		// Same ordering as Double.compare on the two scores
 		return Double.valueOf(maxColumnSimilarity).compareTo(o.maxColumnSimilarity);
 	}
 }
 public class AuthorMerger {
 	private static final Double THRESHOLD = 0.95;
@ -119,6 +148,267 @@ public class AuthorMerger {
 				});
 	}
 	/**
 	 * Produces a lower-cased, blank-separated form of a full name. The input first goes through
 	 * {@code nfd} (defined elsewhere in this class), then every run of non-word characters,
 	 * combining diacritical marks, punctuation, digits and newlines is replaced by a single blank
 	 * and the result is trimmed. Token order is left untouched.
 	 *
 	 * @param fullname the name to normalize
 	 * @return the normalized name
 	 */
 	public static String normalizeFullName(final String fullname) {
 		// do not compact the regexes in a single expression, would cause StackOverflowError
 		// in case of large input strings: each pattern is applied in its own pass, in this order
 		final String[] separatorPatterns = {
 			"(\\W)+",
 			"(\\p{InCombiningDiacriticalMarks})+",
 			"(\\p{Punct})+",
 			"(\\d)+",
 			"(\\n)+"
 		};
 		String cleaned = nfd(fullname).toLowerCase();
 		for (final String pattern : separatorPatterns) {
 			cleaned = cleaned.replaceAll(pattern, " ");
 		}
 		// Earlier token-sorting variant, kept for reference:
 		// return Arrays.stream(fullname.split("[\\s | , | ;]+")).map(String::toLowerCase).sorted().collect(Collectors.joining());
 		return cleaned.trim();
 	}
 	/**
 	 * Builds a grouping key for an author from its surname.
 	 *
 	 * @param a the author
 	 * @return the normalized surname, or the literal {@code "NOSURNAME"} when the surname is null
 	 */
 	private static String generateAuthorkey(final Author a) {
 		final String surname = a.getSurname();
 		return surname == null ? "NOSURNAME" : normalize(surname);
 	}
 //
 //    public static List<Author> enrichOrcid2(List<Author> baseAuthor, List<Author> orcidAuthor) {
 //        if (baseAuthor == null || baseAuthor.isEmpty())
 //            return orcidAuthor;
 //
 //        if (orcidAuthor == null || orcidAuthor.isEmpty())
 //            return baseAuthor;
 //
 //        if (baseAuthor.size() == 1 && orcidAuthor.size() > 10)
 //            return baseAuthor;
 //
 //
 //        Map<String, List<Author>> pubClusters = baseAuthor.stream().collect(Collectors.toMap(AuthorMerger::generateAuthorkey, Arrays::asList, (a, b) -> {
 //            a.addAll(b);
 //            return a;
 //        }));
 //
 //        Map<String, List<Author>> orcidClusters = baseAuthor.stream().collect(Collectors.toMap(AuthorMerger::generateAuthorkey, Arrays::asList, (a, b) -> {
 //            a.addAll(b);
 //            return a;
 //        }));
 //
 //        System.out.println(pubClusters.keySet().size());
 //        System.out.println(orcidClusters.keySet().size());
 //
 //
 //
 //
 //       return null;
 //
 //
 //    }
 	/**
 	 * Counts the positions at which two strings of equal length hold different characters.
 	 * When the lengths differ no positional comparison is made and the length of the longer
 	 * string is returned instead.
 	 *
 	 * @param str1 first string
 	 * @param str2 second string
 	 * @return number of differing positions, or the larger length when the lengths differ
 	 */
 	static int hammingDist(String str1, String str2) {
 		final int firstLength = str1.length();
 		final int secondLength = str2.length();
 		if (firstLength != secondLength) {
 			return Math.max(firstLength, secondLength);
 		}
 		int mismatches = 0;
 		for (int pos = 0; pos < firstLength; pos++) {
 			if (str1.charAt(pos) != str2.charAt(pos)) {
 				mismatches++;
 			}
 		}
 		return mismatches;
 	}
 	/**
 	 * Picks the text used to compare an author: the surname when it is not blank, otherwise the
 	 * full name when that is not blank.
 	 *
 	 * @param author the author to inspect
 	 * @return surname or full name, or {@code null} when both are blank
 	 */
 	private static String authorFieldToBeCompared(Author author) {
 		final String surname = author.getSurname();
 		if (!StringUtils.isBlank(surname)) {
 			return surname;
 		}
 		final String fullname = author.getFullname();
 		return StringUtils.isNotBlank(fullname) ? fullname : null;
 	}
 	/**
 	 * Decides whether two authors denote the same person.
 	 * <p>
 	 * Fast path: when both authors carry a non-blank surname and a non-blank given name, they match
 	 * iff the surnames are equal ignoring case and the given names start with the same character
 	 * (ignoring case).
 	 * <p>
 	 * Otherwise the decision is delegated to {@link #checkSimilarity2(Author, Author)}; the fallback
 	 * used to be a verbatim copy of that method and is now shared so the two cannot drift apart.
 	 *
 	 * @param left first author
 	 * @param right second author
 	 * @return true when the two authors are considered the same person
 	 */
 	public static boolean checkSimilarity3(final Author left, final Author right) {
 		final boolean bothStructured = StringUtils.isNotBlank(left.getSurname())
 			&& StringUtils.isNotBlank(left.getName())
 			&& StringUtils.isNotBlank(right.getSurname())
 			&& StringUtils.isNotBlank(right.getName());
 		if (bothStructured) {
 			// isNotBlank guarantees at least one character, so substring(0, 1) is safe here
 			return left.getSurname().equalsIgnoreCase(right.getSurname())
 				&& left.getName().substring(0, 1).equalsIgnoreCase(right.getName().substring(0, 1));
 		}
 		return checkSimilarity2(left, right);
 	}
 	/**
 	 * Compares two authors through their parsed {@code Person} representation.
 	 * <ul>
 	 * <li>both must expose at least one non-blank surname token, otherwise the result is false;</li>
 	 * <li>they must share at least one surname token (case-insensitive);</li>
 	 * <li>if either side has no non-blank given-name token, the shared surname is enough;</li>
 	 * <li>otherwise some pair of given-name tokens must start with the same character
 	 * (case-insensitive).</li>
 	 * </ul>
 	 *
 	 * @param left first author
 	 * @param right second author
 	 * @return true when the two authors are considered the same person
 	 */
 	public static boolean checkSimilarity2(final Author left, final Author right) {
 		final Person leftPerson = parse(left);
 		final Person rightPerson = parse(right);
 		// If one of them didn't have a surname the match is false
 		final boolean leftHasSurname = leftPerson.getSurname() != null
 			&& leftPerson.getSurname().stream().anyMatch(StringUtils::isNotBlank);
 		if (!leftHasSurname) {
 			return false;
 		}
 		final boolean rightHasSurname = rightPerson.getSurname() != null
 			&& rightPerson.getSurname().stream().anyMatch(StringUtils::isNotBlank);
 		if (!rightHasSurname) {
 			return false;
 		}
 		// The authors must have one surname in common
 		final boolean sharedSurname = leftPerson
 			.getSurname()
 			.stream()
 			.anyMatch(ls -> rightPerson.getSurname().stream().anyMatch(rs -> rs.equalsIgnoreCase(ls)));
 		if (!sharedSurname) {
 			return false;
 		}
 		// If one of them has only a surname and it is the same, they are taken as the same author
 		final boolean leftNameMissing = leftPerson.getName() == null
 			|| leftPerson.getName().stream().allMatch(StringUtils::isBlank);
 		if (leftNameMissing) {
 			return true;
 		}
 		final boolean rightNameMissing = rightPerson.getName() == null
 			|| rightPerson.getName().stream().allMatch(StringUtils::isBlank);
 		if (rightNameMissing) {
 			return true;
 		}
 		// Otherwise some pair of given names must share the same initial
 		for (final String leftName : leftPerson.getName()) {
 			for (final String rightName : rightPerson.getName()) {
 				if (rightName.substring(0, 1).equalsIgnoreCase(leftName.substring(0, 1))) {
 					return true;
 				}
 			}
 		}
 		return false;
 	}
 	/**
 	 * Compares two authors using their raw surname / name / fullname fields.
 	 * <p>
 	 * When both surnames are non-blank: the surnames must be equal ignoring case or differ by fewer
 	 * than two characters according to {@code hammingDist} on their lower-cased form; then the
 	 * authors match if either given name is blank, or the given names are equal ignoring case, or
 	 * they start with the same character.
 	 * <p>
 	 * Otherwise the first non-blank of surname/fullname is taken from each side: values of equal
 	 * length must be equal after {@code normalize}; for different lengths the normalized longer
 	 * value must contain the normalized shorter one.
 	 * <p>
 	 * NOTE(review): the one-character tolerance also applies to very short surnames, so for example
 	 * "Li" and "Lu" pass the surname check - confirm this is intended.
 	 *
 	 * @param left first author
 	 * @param right second author
 	 * @return true when the two authors are considered the same person
 	 */
 	public static boolean checkSimilarity(final Author left, final Author right) {
 		final boolean leftUnnamed = left.getSurname() == null && left.getFullname() == null;
 		if (leftUnnamed) {
 			return false;
 		}
 		final boolean rightUnnamed = right.getSurname() == null && right.getFullname() == null;
 		if (rightUnnamed) {
 			return false;
 		}
 		if (StringUtils.isBlank(left.getSurname()) || StringUtils.isBlank(right.getSurname())) {
 			// At least one side lacks a surname: compare the first usable field of each author
 			final String leftField = authorFieldToBeCompared(left);
 			final String rightField = authorFieldToBeCompared(right);
 			if (leftField == null || rightField == null) {
 				return false;
 			}
 			final String leftNormalized = normalize(leftField);
 			final String rightNormalized = normalize(rightField);
 			// The same length means they are the same kind of field
 			if (leftField.length() == rightField.length()) {
 				return leftNormalized.equals(rightNormalized);
 			}
 			// The shorter value is probably a surname, the longer one a fullname
 			return leftField.length() < rightField.length()
 				? rightNormalized.contains(leftNormalized)
 				: leftNormalized.contains(rightNormalized);
 		}
 		// Same surname, or tolerate one different character (let's say one typo)
 		final String leftSurname = left.getSurname();
 		final String rightSurname = right.getSurname();
 		final boolean surnameMatches = leftSurname.equalsIgnoreCase(rightSurname)
 			|| hammingDist(leftSurname.toLowerCase(), rightSurname.toLowerCase()) < 2;
 		if (!surnameMatches) {
 			return false;
 		}
 		// In case one of the two authors has no given name the match is true
 		if (StringUtils.isBlank(left.getName()) || StringUtils.isBlank(right.getName())) {
 			return true;
 		}
 		// Same given name, or given names starting with the same letter
 		return left.getName().equalsIgnoreCase(right.getName())
 			|| left.getName().substring(0, 1).equalsIgnoreCase(right.getName().substring(0, 1));
 	}
 	/**
 	 * Copies the pids of ORCID authors onto the matching publication authors.
 	 * <p>
 	 * Each base author is compared, in list order, against the ORCID authors that have not been
 	 * consumed yet; the first one accepted by {@link #checkSimilarity2(Author, Author)} donates its
 	 * pids and is then taken out of the candidate pool, so an ORCID author is used at most once.
 	 * The base authors are modified in place; {@code orcidAuthor} itself is not modified.
 	 *
 	 * @param baseAuthor authors of the publication; returned (enriched) unless null or empty
 	 * @param orcidAuthor authors coming from ORCID
 	 * @return {@code orcidAuthor} when {@code baseAuthor} is null or empty; otherwise
 	 *         {@code baseAuthor}, which is left untouched when {@code orcidAuthor} is null or empty
 	 *         or when a single base author faces more than 10 ORCID authors
 	 */
 	public static List<Author> enrichOrcid2(List<Author> baseAuthor, List<Author> orcidAuthor) {
 		if (baseAuthor == null || baseAuthor.isEmpty())
 			return orcidAuthor;
 		if (orcidAuthor == null || orcidAuthor.isEmpty())
 			return baseAuthor;
 		if (baseAuthor.size() == 1 && orcidAuthor.size() > 10)
 			return baseAuthor;
 		// Work on a copy so the caller's ORCID list is not altered while candidates are consumed
 		final List<Author> unmatched = new ArrayList<>(orcidAuthor);
 		for (final Author base : baseAuthor) {
 			final Iterator<Author> candidates = unmatched.iterator();
 			while (candidates.hasNext()) {
 				final Author candidate = candidates.next();
 				if (checkSimilarity2(base, candidate)) {
 					// A matched candidate without pids has nothing to donate but is still consumed
 					if (candidate.getPid() != null) {
 						addPid(base, candidate.getPid());
 					}
 					// Drops exactly the matched element, without a second equals-based scan
 					candidates.remove();
 					break;
 				}
 			}
 		}
 		return baseAuthor;
 	}
 	/**
 	 * Similarity-based variant of the ORCID enrichment.
 	 * <p>
 	 * For every ORCID author the base author with the highest {@code sim} score is recorded; every
 	 * ORCID author whose best score is above 0.85 then has its pids added to that base author, in
 	 * ascending order of score. The base authors are modified in place.
 	 * <p>
 	 * NOTE(review): several ORCID authors may select the same base author, which then receives the
 	 * pids of all of them - confirm whether an exclusive assignment is wanted.
 	 *
 	 * @param baseAuthor authors of the publication; returned (enriched) unless null or empty
 	 * @param orcidAuthor authors coming from ORCID
 	 * @return {@code orcidAuthor} when {@code baseAuthor} is null or empty; otherwise
 	 *         {@code baseAuthor}, which is left untouched when {@code orcidAuthor} is null or empty
 	 *         or when a single base author faces more than 10 ORCID authors
 	 */
 	public static List<Author> enrichOrcid(List<Author> baseAuthor, List<Author> orcidAuthor) {
 		if (baseAuthor == null || baseAuthor.isEmpty())
 			return orcidAuthor;
 		if (orcidAuthor == null || orcidAuthor.isEmpty())
 			return baseAuthor;
 		if (baseAuthor.size() == 1 && orcidAuthor.size() > 10)
 			return baseAuthor;
 		final double minSimilarity = 0.85;
 		// One best-match record per ORCID author
 		final List<SimilarityCellInfo> bestPerOrcidAuthor = new ArrayList<>(orcidAuthor.size());
 		for (int j = 0; j < orcidAuthor.size(); j++)
 			bestPerOrcidAuthor.add(new SimilarityCellInfo());
 		// Only the running maximum per ORCID author is needed, so each score is used on the fly
 		// instead of being stored in a baseAuthor x orcidAuthor matrix of boxed values
 		for (int i = 0; i < baseAuthor.size(); i++) {
 			for (int j = 0; j < orcidAuthor.size(); j++) {
 				final double similarity = sim(baseAuthor.get(i), orcidAuthor.get(j));
 				final SimilarityCellInfo best = bestPerOrcidAuthor.get(j);
 				if (best.maxColumnSimilarity < similarity)
 					best.setValues(i, j, similarity);
 			}
 		}
 		// Filtering first sorts fewer elements; the sort is stable, so the resulting order is the
 		// same as sorting first and filtering afterwards
 		bestPerOrcidAuthor
 			.stream()
 			.filter(si -> si.maxColumnSimilarity > minSimilarity)
 			.sorted()
 			.forEach(si -> addPid(baseAuthor.get(si.authorPosition), orcidAuthor.get(si.orcidPosition).getPid()));
 		return baseAuthor;
 	}
 	/**
 	 * Appends the given pids to an author, creating the author's pid list when it is missing.
 	 *
 	 * @param a the author to enrich (modified in place)
 	 * @param pids the pids to append; a null list adds nothing
 	 */
 	private static void addPid(final Author a, final List<StructuredProperty> pids) {
 		if (a.getPid() == null) {
 			a.setPid(new ArrayList<>());
 		}
 		// addAll(null) would throw a NullPointerException
 		if (pids != null) {
 			a.getPid().addAll(pids);
 		}
 	}
 	public static String pidToComparableString(StructuredProperty pid) {
 		final String classid = pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase()
 			: "";
@ -171,7 +461,7 @@ public class AuthorMerger {
 		}
 	}
-	private static String normalize(final String s) {
+	public static String normalize(final String s) {
 		String[] normalized = nfd(s)
 			.toLowerCase()
 			// do not compact the regexes in a single expression, would cause StackOverflowError
--- a/dhp-common/src/test/java/eu/dnetlib/oa/merge/AuthorMergerTest.java
+++ b/dhp-common/src/test/java/eu/dnetlib/oa/merge/AuthorMergerTest.java
@ -0,0 +1,125 @@
 package eu.dnetlib.oa.merge;
 import static org.junit.jupiter.api.Assertions.*;
 import java.io.BufferedReader;
 import java.io.FileReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.util.Arrays;
 import java.util.List;
 import java.util.stream.Collectors;
 import org.junit.jupiter.api.Test;
 import org.junit.platform.commons.util.StringUtils;
 import com.fasterxml.jackson.core.type.TypeReference;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.oa.merge.AuthorMerger;
 import eu.dnetlib.dhp.schema.oaf.Author;
 public class AuthorMergerTest {
 	/**
 	 * Pins the output of {@code AuthorMerger.normalizeFullName}: a lower-cased string in which each
 	 * run of separators is replaced by a single blank, with the original token order preserved.
 	 * The previous expectations (sorted, concatenated tokens such as "bruzzolasandro") matched the
 	 * token-sorting variant that is commented out in that method, not the active implementation.
 	 */
 	@Test
 	public void testNormalization() {
 		assertEquals("sandro la bruzzo", AuthorMerger.normalizeFullName("Sandro, La Bruzzo"));
 		assertEquals("miriam baglioni", AuthorMerger.normalizeFullName("Miriam Baglioni"));
 		assertEquals("miriam baglioni", AuthorMerger.normalizeFullName("Miriam ;Baglioni,"));
 	}
 	/**
 	 * Counts the authors that carry at least one pid whose qualifier classid contains "orcid"
 	 * (case-insensitive).
 	 */
 	private static long countAuthorsWithOrcid(final List<Author> authors) {
 		return authors
 			.stream()
 			.filter(
 				a -> a.getPid() != null && a
 					.getPid()
 					.stream()
 					.anyMatch(
 						p -> p.getQualifier() != null
 							&& p.getQualifier().getClassid().toLowerCase().contains("orcid")))
 			.count();
 	}
 	/**
 	 * Manual exploration harness: runs {@code AuthorMerger.enrichOrcid2} on up to 31 records read
 	 * from two classpath files and prints how many authors carry an ORCID pid before and after.
 	 * It is not annotated with {@code @Test}, so JUnit does not execute it.
 	 * <p>
 	 * As read below, the publication file alternates an id line and a JSON author-list line, while
 	 * the ORCID file holds a matched-pid line, an id line and a JSON author-list line per record.
 	 * NOTE(review): when the publication data line is blank the ORCID author-list line is not
 	 * consumed, which would shift the following records - confirm against the fixture format.
 	 */
 	public void testEnrcichAuthor() throws Exception {
 		final ObjectMapper mapper = new ObjectMapper();
 		final TypeReference<List<Author>> aclass = new TypeReference<List<Author>>() {
 		};
 		// try-with-resources: both readers are closed even when parsing fails
 		try (
 			BufferedReader pr = new BufferedReader(new InputStreamReader(
 				AuthorMergerTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/merge/authors_publication.json")));
 			BufferedReader or = new BufferedReader(new InputStreamReader(
 				AuthorMergerTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/merge/authors_orcid.json")))) {
 			String pubLine;
 			int i = 0;
 			while ((pubLine = pr.readLine()) != null) {
 				final String pubId = pubLine;
 				final String MatchPidOrcid = or.readLine();
 				final String pubOrcid = or.readLine();
 				final String data = pr.readLine();
 				if (StringUtils.isNotBlank(data)) {
 					List<Author> publicationAuthors = mapper.readValue(data, aclass);
 					List<Author> orcidAuthors = mapper.readValue(or.readLine(), aclass);
 					System.out.printf("OAF ID = %s \n", pubId);
 					System.out.printf("ORCID Intersected ID = %s \n", pubOrcid);
 					System.out.printf("OAF Author Size = %d \n", publicationAuthors.size());
 					System.out.printf("Oricd Author Size = %d \n", orcidAuthors.size());
 					System.out.printf("Oricd Matched PID = %s \n", MatchPidOrcid);
 					long originalAuthorWithPiD = countAuthorsWithOrcid(publicationAuthors);
 					long start = System.currentTimeMillis();
 //                final List<Author> enrichedList = AuthorMerger.enrichOrcid(publicationAuthors, orcidAuthors);
 					final List<Author> enrichedList = AuthorMerger.enrichOrcid2(publicationAuthors, orcidAuthors);
 					long enrichedAuthorWithPid = countAuthorsWithOrcid(enrichedList);
 					long totalTime = (System.currentTimeMillis() - start) / 1000;
 					System.out
 						.printf(
 							"Enriched authors in %d seconds from %d pid to %d pid \n", totalTime, originalAuthorWithPiD,
 							enrichedAuthorWithPid);
 					System.out.println("=================");
 					if (++i > 30)
 						break;
 				}
 			}
 		}
 	}
 	/**
 	 * Two authors sharing the surname "Wu" but with given names starting with different letters
 	 * ("M." and "Xin") must not be considered the same person by {@code checkSimilarity2}.
 	 */
 	@Test
 	public void checkSimilarityTest() {
 		final Author left = new Author();
 		left.setSurname("Wu");
 		left.setName("M.");
 		left.setFullname("Wu, M.");
 		System.out.println(AuthorMerger.normalizeFullName(left.getFullname()));
 		final Author right = new Author();
 		right.setName("Xin");
 		right.setSurname("Wu");
 		right.setFullname("Xin Wu");
 //        System.out.println(AuthorMerger.normalize(right.getFullname()));
 		boolean same = AuthorMerger.checkSimilarity2(left, right);
 		assertFalse(same);
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/orcid/DownloadORCIDTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/orcid/DownloadORCIDTest.java
@ -32,45 +32,6 @@ import eu.dnetlib.dhp.parser.utility.VtdException;
 public class DownloadORCIDTest {
 	private final Logger log = LoggerFactory.getLogger(DownloadORCIDTest.class);
 //	public void test() throws Exception {
 //
 //		Configuration conf = new Configuration();
 //		// Set FileSystem URI
 ////        conf.set("fs.defaultFS", "file://");
 //		// Because of Maven
 //		conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
 //		conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
 //
 //		System.setProperty("hadoop.home.dir", "file:///Users/sandro/orcid/");
 //
 //		final FileSystem fileSystem = FileSystem.get(conf);
 //
 //		new ExtractORCIDDump(fileSystem).run("/Users/sandro/orcid/", "/Users/sandro/orcid/extracted");
 //
 ////		final GZIPInputStream gzip = new GZIPInputStream(Files.newInputStream(Paths.get("/Users/sandro/orcid/ORCID_2023_10_activities_1.tar.gz")));
 ////		try(final TarArchiveInputStream tais = new TarArchiveInputStream(gzip)) {
 ////
 ////			TarArchiveEntry entry;
 ////			while ((entry = tais.getNextTarEntry()) != null) {
 ////
 ////				if (entry.isFile() && entry.getName().contains("employments")) {
 ////
 ////					System.out.println(entry.getName());
 ////					final String [] items = entry.getName().split("/");
 ////
 ////					final String res = IOUtils.toString(new BufferedReader(new InputStreamReader(tais)));
 ////					System.out.println("res = " + res);
 ////
 ////					System.out.println(items[items.length-2]);
 ////					break;
 ////				}
 ////
 ////
 ////			}
 ////		}
 //
 //	}
 	@Test
 	public void testSummary() throws Exception {
 		final String xml = IOUtils
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/enrich_graph_orcid_parameters.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/enrich_graph_orcid_parameters.json
@ -0,0 +1,26 @@
 [
  {
    "paramName": "mt",
    "paramLongName": "master",
    "paramDescription": "should be local or yarn",
    "paramRequired": true
  },
  {
    "paramName": "op",
    "paramLongName": "orcidPath",
    "paramDescription": "the path of the orcid Table generated by the dump",
    "paramRequired": true
  },
  {
    "paramName": "gp",
    "paramLongName": "graphPath",
    "paramDescription": "the path of the graph we want to apply enrichment",
    "paramRequired": true
  },
  {
    "paramName": "tp",
    "paramLongName": "targetPath",
    "paramDescription": "the output path of the graph enriched",
    "paramRequired": true
  }
 ]
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/oozie_app/config-default.xml
@ -0,0 +1,34 @@
 <configuration>
    <!-- Default property values shipped with the ORCID enrichment Oozie application. -->
    <!-- Cluster endpoints -->
    <property>
        <name>jobTracker</name>
        <value>yarnRM</value>
    </property>
    <property>
        <name>nameNode</name>
        <value>hdfs://nameservice1</value>
    </property>
    <!-- Oozie library resolution for the Spark action -->
    <property>
        <name>oozie.use.system.libpath</name>
        <value>true</value>
    </property>
    <property>
        <name>oozie.action.sharelib.for.spark</name>
        <value>spark2</value>
    </property>
    <!-- Hive settings. NOTE(review): the host names look specific to one test cluster - confirm
         they are overridden where this workflow is deployed. -->
    <property>
        <name>hiveMetastoreUris</name>
        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
    </property>
    <property>
        <name>hiveJdbcUrl</name>
        <value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
    </property>
    <property>
        <name>hiveDbName</name>
        <value>openaire</value>
    </property>
    <property>
        <name>oozie.launcher.mapreduce.user.classpath.first</name>
        <value>true</value>
    </property>
 </configuration>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/oozie_app/workflow.xml
@ -0,0 +1,52 @@
    <workflow-app name="Enrich_graph_with_ORCID_Workflow" xmlns="uri:oozie:workflow:0.5">
    <!-- Single-action workflow: start -> EnrichGraph -> End, with Kill as the error target. -->
    <!-- Parameters declared by this workflow. The Spark options below also reference
         sparkExecutorMemory, sparkExecutorCores, sparkDriverMemory, spark2ExtraListeners,
         spark2SqlQueryExecutionListeners, spark2YarnHistoryServerAddress, spark2EventLogDir,
         nameNode and projectVersion, none of which are declared here, so they have to be
         provided from outside this file. -->
    <parameters>
        <property>
            <name>orcidPath</name>
            <description>the path of the orcid Table generated by the dump</description>
        </property>
        <property>
            <name>graphPath</name>
            <description>the path of the graph we want to apply enrichment</description>
        </property>
        <property>
            <name>targetPath</name>
            <description>the output path of the graph enriched</description>
        </property>
    </parameters>
    <start to="EnrichGraph"/>
    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <!-- Runs SparkEnrichGraphWithOrcidAuthors on yarn in cluster mode. -->
    <action name="EnrichGraph">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Enrich Graph with ORCID</name>
            <class>eu.dnetlib.dhp.enrich.orcid.SparkEnrichGraphWithOrcidAuthors</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=2g
                --conf spark.sql.shuffle.partitions=3000
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            </spark-opts>
            <!-- Application arguments; note that only the "publication" sub-folder of graphPath
                 is passed on as the graph path. -->
            <arg>--orcidPath</arg><arg>${orcidPath}</arg>
            <arg>--targetPath</arg><arg>${targetPath}</arg>
            <arg>--graphPath</arg><arg>${graphPath}/publication</arg>
            <arg>--master</arg><arg>yarn</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
    <end name="End"/>
 </workflow-app>
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/AuthorEnricher.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/AuthorEnricher.scala
@ -0,0 +1,37 @@
 package eu.dnetlib.dhp.enrich.orcid
 import eu.dnetlib.dhp.schema.oaf.{Author, Publication}
 import eu.dnetlib.dhp.schema.sx.OafUtils
 import org.apache.spark.sql.Row
 import scala.collection.JavaConverters._
 /** Helpers that turn ORCID author rows into OAF [[Author]] beans. */
 object AuthorEnricher extends Serializable {
  /** Builds an OAF author carrying a single ORCID pid.
    *
    * The full name is assembled from the parts that are actually present, so a missing (null or
    * blank) given or family name no longer ends up as the literal text "null" in the full name.
    * When both parts are present the result is still "givenName familyName".
    *
    * @param givenName  given name, may be null
    * @param familyName family name, may be null
    * @param orcid      ORCID identifier stored as the value of the pid
    * @return the new author
    */
  def createAuthor(givenName: String, familyName: String, orcid: String): Author = {
    val a = new Author
    a.setName(givenName)
    a.setSurname(familyName)
    val presentParts = Seq(givenName, familyName).filter(part => part != null && part.trim.nonEmpty)
    a.setFullname(presentParts.mkString(" "))
    a.setPid(List(OafUtils.createSP(orcid, "ORCID", "ORCID")).asJava)
    a
  }
  /** Converts the list of author structs found at position 1 of the row into OAF authors.
    *
    * Each element is read through its `givenName`, `familyName` and `orcid` fields. The column is
    * addressed by position, so the caller must keep the author list as the second column.
    *
    * @param r row whose second column is a list of author rows
    * @return one OAF author per element, in the same order
    */
  def toOAFAuthor(r: Row): java.util.List[Author] = {
    r.getList[Row](1)
      .asScala
      .map(s => createAuthor(s.getAs[String]("givenName"), s.getAs[String]("familyName"), s.getAs[String]("orcid")))
      .toList
      .asJava
  }
 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala
@ -0,0 +1,119 @@
 package eu.dnetlib.dhp.enrich.orcid
 import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.application.AbstractScalaApplication
 import eu.dnetlib.dhp.oa.merge.AuthorMerger
 import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Instance, Publication, StructuredProperty}
 import org.apache.spark.sql.{Dataset, Encoder, Encoders, Row, SaveMode, SparkSession}
 import org.apache.spark.sql.functions.{col, collect_set, concat, explode, expr, first, flatten, lower, size, struct}
 import org.slf4j.{Logger, LoggerFactory}
 import org.apache.spark.sql.types._
 class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String], log: Logger)
    extends AbstractScalaApplication(propertyPath, args, log: Logger) {
  /** Entry point of the Spark node: reads the `graphPath`, `orcidPath` and `targetPath`
    * arguments, logs them and runs the enrichment.
    */
  override def run(): Unit = {
    val graphPath = parser.get("graphPath")
    log.info(s"graphPath is '$graphPath'")
    val orcidPath = parser.get("orcidPath")
    log.info(s"orcidPath is '$orcidPath'")
    val targetPath = parser.get("targetPath")
    log.info(s"targetPath is '$targetPath'")
    enrichResult(spark, graphPath, orcidPath, targetPath)
  }
  /** Enriches the authors of the publications stored under `graphPath` with ORCID pids.
    *
    * Steps:
    *  1. build the ORCID table (pid schema, pid value, author) from `orcidPath`;
    *  2. read the instance pids of the records that are not deleted by inference;
    *  3. join the two on lower-cased schema and value and collect, per record id, the set of
    *     ORCID authors;
    *  4. left-join the publications with that table and merge the authors through
    *     `AuthorMerger.enrichOrcid2`; publications without a match are written unchanged.
    *
    * @param spark      the Spark session
    * @param graphPath  path of the JSON publications to enrich
    * @param orcidPath  base path of the ORCID tables
    * @param outputPath where the enriched publications are written (overwritten, gzip JSON)
    */
  def enrichResult(spark: SparkSession, graphPath: String, orcidPath: String, outputPath: String): Unit = {
    val orcidPublication = generateOrcidTable(spark, orcidPath)
    // Explicit type on the implicit value, so its resolution does not depend on inference
    implicit val publicationEncoder: Encoder[Publication] = Encoders.bean(classOf[Publication])
    // Reduced schema: only the fields needed to extract the instance pids are parsed
    val schema = new StructType()
      .add("id", StringType)
      .add("dataInfo", Encoders.bean(classOf[DataInfo]).schema)
      .add(
        "instance",
        ArrayType(new StructType().add("pid", ArrayType(Encoders.bean(classOf[StructuredProperty]).schema)))
      )
    // One row per (record id, instance pid)
    val entities = spark.read
      .schema(schema)
      .json(graphPath)
      .where("datainfo.deletedbyinference = false")
      .drop("datainfo")
      .withColumn("instances", explode(col("instance")))
      .withColumn("pids", explode(col("instances.pid")))
      .select(
        col("pids.qualifier.classid").alias("pid_schema"),
        col("pids.value").alias("pid_value"),
        col("id").alias("dnet_id")
      )
    // Column order matters: AuthorEnricher.toOAFAuthor reads the author list at position 1
    val orcidDnet = orcidPublication
      .join(
        entities,
        lower(col("schema")).equalTo(lower(col("pid_schema"))) &&
        lower(col("value")).equalTo(lower(col("pid_value"))),
        "inner"
      )
      .groupBy(col("dnet_id"))
      .agg(collect_set(orcidPublication("author")).alias("orcid_authors"))
      .select("dnet_id", "orcid_authors")
      .cache()
    val publication = spark.read.schema(publicationEncoder.schema).json(graphPath).as[Publication]
    publication
      .joinWith(orcidDnet, publication("id").equalTo(orcidDnet("dnet_id")), "left")
      .map {
        // Left join: the right side is null when no ORCID author was found for the publication
        case (p: Publication, null) => {
          p
        }
        case (p: Publication, r: Row) =>
          p.setAuthor(AuthorMerger.enrichOrcid2(p.getAuthor, AuthorEnricher.toOAFAuthor(r)))
          p
      }
      .write
      .mode(SaveMode.Overwrite)
      .option("compression", "gzip")
      .json(outputPath)
  }
  /** Builds the table linking work identifiers to ORCID authors.
    *
    * Reads `$inputPath/Authors` and `$inputPath/Works`, keeps the work identifiers whose schema is
    * doi, pmid, pmc, arxiv or handle, and joins works and authors on the `orcid` column.
    * `creditName` and `otherNames` are selected from the authors but not part of the output.
    *
    * @param spark     the Spark session
    * @param inputPath base path of the ORCID tables
    * @return rows with columns `schema`, `value` and `author` (struct of orcid, givenName,
    *         familyName)
    */
  def generateOrcidTable(spark: SparkSession, inputPath: String): Dataset[Row] = {
    val orcidAuthors =
      spark.read.load(s"$inputPath/Authors").select("orcid", "familyName", "givenName", "creditName", "otherNames")
    val orcidWorks = spark.read
      .load(s"$inputPath/Works")
      .select(col("orcid"), explode(col("pids")).alias("identifier"))
      .where(
        "identifier.schema = 'doi' or identifier.schema ='pmid' or identifier.schema ='pmc' or identifier.schema ='arxiv' or identifier.schema ='handle'"
      )
    orcidAuthors
      .join(orcidWorks, orcidAuthors("orcid").equalTo(orcidWorks("orcid")))
      .select(
        col("identifier.schema").alias("schema"),
        col("identifier.value").alias("value"),
        struct(orcidAuthors("orcid").alias("orcid"), col("givenName"), col("familyName")).alias("author")
      )
  }
 }
 /** Command-line launcher of the ORCID enrichment Spark application. */
 object SparkEnrichGraphWithOrcidAuthors {
  val log: Logger = LoggerFactory.getLogger(SparkEnrichGraphWithOrcidAuthors.getClass)
  /** Creates the application from the classpath parameter descriptor and the given command-line
    * arguments, initializes it and runs it.
    *
    * @param args command-line arguments handed over to the application
    */
  def main(args: Array[String]): Unit = {
    val parametersResource = "/eu/dnetlib/dhp/enrich/orcid/enrich_graph_orcid_parameters.json"
    val application = new SparkEnrichGraphWithOrcidAuthors(parametersResource, args, log)
    application.initialize().run()
  }
 }
--- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/EnrichOrcidTest.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/EnrichOrcidTest.scala
@ -0,0 +1,12 @@
 package eu.dnetlib.dhp.enrich.orcid
 import org.apache.spark.SparkConf
 import org.apache.spark.sql.SparkSession
 import org.junit.jupiter.api.Test
 import org.slf4j.{Logger, LoggerFactory}
 // Placeholder test class: it currently declares only a logger and no test methods.
 class EnrichOrcidTest {
  // Logger named after the runtime class of this instance
  val log: Logger = LoggerFactory.getLogger(getClass)
 }