added workflow to generate seq(orcidId,work) and seq(orcidId,enrichedWork)

2020-06-25 18:43:29 +02:00 · 2020-06-25 18:43:29 +02:00 · d6498278ed
parent fcbb4c1489
commit d6498278ed
14 changed files with 1125 additions and 231 deletions
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java
@ -19,7 +19,7 @@ import org.apache.hadoop.io.compress.CompressionCodec;
 import org.apache.hadoop.io.compress.CompressionCodecFactory;
 import org.mortbay.log.Log;

-import eu.dnetlib.doiboost.orcid.json.JsonWriter;
+import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
 import eu.dnetlib.doiboost.orcid.model.WorkData;
 import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;

--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java
@ -19,7 +19,7 @@ import org.apache.hadoop.io.compress.CompressionCodec;
 import org.apache.hadoop.io.compress.CompressionCodecFactory;
 import org.mortbay.log.Log;

-import eu.dnetlib.doiboost.orcid.json.JsonWriter;
+import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
 import eu.dnetlib.doiboost.orcid.model.AuthorData;
 import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;

--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java
@ -0,0 +1,16 @@
+
+package eu.dnetlib.doiboost.orcid.json;
+
+import com.google.gson.Gson;
+import com.google.gson.JsonObject;
+import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
+
+/**
+ * JSON helpers for ORCID works that are stored in sequence files.
+ */
+public class JsonHelper {
+
+	/**
+	 * Builds the JSON envelope of a work: an object with the property {@code oid} (the value returned by
+	 * {@code workData.getOid()}) and the property {@code work}, holding the Gson serialization of the whole
+	 * work <em>as a JSON string</em> (i.e. the nested document is string-escaped, not a nested object).
+	 *
+	 * @param workData the work to wrap
+	 * @return the textual form of the envelope
+	 */
+	public static String createOidWork(WorkDataNoDoi workData) {
+		final JsonObject envelope = new JsonObject();
+		final String oid = workData.getOid();
+		envelope.addProperty("oid", oid);
+		final String serializedWork = new Gson().toJson(workData);
+		envelope.addProperty("work", serializedWork);
+		return envelope.toString();
+	}
+}
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
@ -0,0 +1,149 @@
+
+package eu.dnetlib.doiboost.orcidnodoi;
+
+import eu.dnetlib.doiboost.orcid.json.JsonHelper;
+import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
+import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
+import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.CompressionCodecFactory;
+import org.mortbay.log.Log;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.URI;
+
+public class ActivitiesDumpReader {
+
+	private static final int MAX_XML_WORKS_PARSED = -1;
+	private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 100000;
+
+	public static void parseGzActivities(Configuration conf, String inputUri, Path outputPath)
+		throws Exception {
+		String uri = inputUri;
+		FileSystem fs = FileSystem.get(URI.create(uri), conf);
+		Path inputPath = new Path(uri);
+		CompressionCodecFactory factory = new CompressionCodecFactory(conf);
+		CompressionCodec codec = factory.getCodec(inputPath);
+		if (codec == null) {
+			System.err.println("No codec found for " + uri);
+			System.exit(1);
+		}
+		CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
+		InputStream gzipInputStream = null;
+		try {
+			gzipInputStream = codec.createInputStream(fs.open(inputPath));
+			parseTarActivities(fs, conf, gzipInputStream, outputPath);
+
+		} finally {
+			Log.debug("Closing gzip stream");
+			IOUtils.closeStream(gzipInputStream);
+		}
+	}
+
+	private static void parseTarActivities(
+		FileSystem fs, Configuration conf, InputStream gzipInputStream, Path outputPath) {
+		int counter = 0;
+		int noDoiFound = 0;
+		int errorFromOrcidFound = 0;
+		int xmlParserErrorFound = 0;
+		try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
+			TarArchiveEntry entry = null;
+
+			try (SequenceFile.Writer writer = SequenceFile
+				.createWriter(
+					conf,
+					SequenceFile.Writer.file(outputPath),
+					SequenceFile.Writer.keyClass(Text.class),
+					SequenceFile.Writer.valueClass(Text.class))) {
+				while ((entry = tais.getNextTarEntry()) != null) {
+					String filename = entry.getName();
+
+					try {
+						if (entry.isDirectory() || !filename.contains("works")) {
+
+						} else {
+							Log.debug("XML work entry name: " + entry.getName());
+							counter++;
+							BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from
+																									// tarInput
+							String line;
+							StringBuffer buffer = new StringBuffer();
+							while ((line = br.readLine()) != null) {
+								buffer.append(line);
+							}
+							WorkDataNoDoi workDataNoDoi = XMLRecordParserNoDoi.VTDParseWorkData(buffer.toString().getBytes());
+							if (workDataNoDoi != null) {
+								if (workDataNoDoi.getErrorCode() != null) {
+									errorFromOrcidFound += 1;
+									Log
+										.debug(
+											"error from Orcid with code "
+												+ workDataNoDoi.getErrorCode()
+												+ " for entry "
+												+ entry.getName());
+									continue;
+								}
+								boolean isDoiFound = workDataNoDoi.getExtIds().stream()
+										.filter(e -> e.getType()!=null)
+										.anyMatch(e -> e.getType().equals("doi"));
+								if (!isDoiFound) {
+									String jsonData = JsonHelper.createOidWork(workDataNoDoi);
+									Log.debug("oid: " + workDataNoDoi.getOid() + " data: " + jsonData);
+
+									final Text key = new Text(workDataNoDoi.getOid());
+									final Text value = new Text(jsonData);
+
+									try {
+										writer.append(key, value);
+									} catch (IOException e) {
+										Log.debug("Writing to sequence file: " + e.getMessage());
+										Log.debug(e);
+										throw new RuntimeException(e);
+									}
+									noDoiFound += 1;
+								}
+
+							} else {
+								Log.warn("Data not retrievable [" + entry.getName() + "] " + buffer.toString());
+								xmlParserErrorFound += 1;
+							}
+						}
+					} catch (Exception e) {
+						Log
+							.warn(
+								"Parsing work from tar archive and xml work: " + filename + "  " + e.getMessage());
+						Log.warn(e);
+					}
+
+					if ((counter % XML_WORKS_PARSED_COUNTER_LOG_INTERVAL) == 0) {
+						Log.info("Current xml works parsed: " + counter);
+					}
+
+					if ((MAX_XML_WORKS_PARSED > -1) && (counter > MAX_XML_WORKS_PARSED)) {
+						break;
+					}
+				}
+			}
+		} catch (IOException e) {
+			Log.warn("Parsing work from gzip archive: " + e.getMessage());
+			Log.warn(e);
+			throw new RuntimeException(e);
+		}
+		Log.info("Activities parse completed");
+		Log.info("Total XML works parsed: " + counter);
+		Log.info("Total no doi work found: " + noDoiFound);
+		Log.info("Error from Orcid found: " + errorFromOrcidFound);
+		Log.info("Error parsing xml work found: " + xmlParserErrorFound);
+	}
+}
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java
@ -0,0 +1,52 @@
+
+package eu.dnetlib.doiboost.orcidnodoi;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.doiboost.orcid.OrcidDSManager;
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.mortbay.log.Log;
+
+import java.io.IOException;
+
+/**
+ * Command line entry point that reads the ORCID activities archive and produces the sequence file of
+ * works, delegating the parsing to {@link ActivitiesDumpReader}.
+ */
+public class GenOrcidAuthorWork extends OrcidDSManager {
+
+	private String activitiesFileNameTarGz;
+	private String outputWorksPath;
+	private String workingPath;
+
+	/**
+	 * Loads the arguments and runs the generation.
+	 *
+	 * @param args the command line arguments, parsed by {@link #loadArgs(String[])}
+	 */
+	public static void main(String[] args) throws IOException, Exception {
+		final GenOrcidAuthorWork job = new GenOrcidAuthorWork();
+		job.loadArgs(args);
+		job.generateAuthorsDOIsData();
+	}
+
+	/**
+	 * Resolves the archive uri and the output path below {@code hdfsServerUri + workingPath} and parses the
+	 * archive.
+	 */
+	public void generateAuthorsDOIsData() throws Exception {
+		final Configuration hadoopConf = initConfigurationObject();
+		// result unused here; the call is kept because its side effects are not visible from this class
+		FileSystem fs = initFileSystemObject(hadoopConf);
+		final String basePath = hdfsServerUri.concat(workingPath);
+		final String tarGzUri = basePath.concat(activitiesFileNameTarGz);
+		final Path outputPath = new Path(basePath.concat(outputWorksPath));
+		ActivitiesDumpReader.parseGzActivities(hadoopConf, tarGzUri, outputPath);
+	}
+
+	/**
+	 * Parses the command line against the parameter specification found on the classpath and stores the
+	 * values needed by {@link #generateAuthorsDOIsData()}.
+	 */
+	private void loadArgs(String[] args) throws IOException, Exception {
+		final String parametersSpec = IOUtils
+			.toString(
+				GenOrcidAuthorWork.class
+					.getResourceAsStream(
+						"/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json"));
+		final ArgumentApplicationParser argParser = new ArgumentApplicationParser(parametersSpec);
+		argParser.parseArgument(args);
+
+		hdfsServerUri = argParser.get("hdfsServerUri");
+		Log.info("HDFS URI: " + hdfsServerUri);
+
+		workingPath = argParser.get("workingPath");
+		Log.info("Working Path: " + workingPath);
+
+		activitiesFileNameTarGz = argParser.get("activitiesFileNameTarGz");
+		Log.info("Activities File Name: " + activitiesFileNameTarGz);
+
+		outputWorksPath = argParser.get("outputWorksPath");
+		Log.info("Output Author Work Data: " + outputWorksPath);
+	}
+}
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
@ -0,0 +1,119 @@
+
+package eu.dnetlib.doiboost.orcidnodoi;
+
+import com.google.gson.Gson;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonParser;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.doiboost.orcid.model.AuthorData;
+import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
+import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import scala.Tuple2;
+
+import java.io.IOException;
+import java.util.Objects;
+import java.util.Optional;
+
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+/**
+ * Spark job that joins the ORCID works without doi with the ORCID author summaries (on the orcid id),
+ * runs the {@link AuthorMatcher} on the contributors of every work and saves the result as text.
+ */
+public class SparkGenEnrichedOrcidWorks {
+
+	/**
+	 * Job entry point.
+	 *
+	 * @param args the command line arguments, described by
+	 *             {@code /eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json}
+	 */
+	public static void main(String[] args) throws IOException, Exception {
+		Logger logger = LoggerFactory.getLogger(SparkGenEnrichedOrcidWorks.class);
+		logger.info("[ SparkGenEnrichedOrcidWorks STARTED]");
+
+		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+			IOUtils
+				.toString(
+					SparkGenEnrichedOrcidWorks.class
+						.getResourceAsStream(
+							"/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json")));
+		parser.parseArgument(args);
+		Boolean isSparkSessionManaged = Optional
+			.ofNullable(parser.get("isSparkSessionManaged"))
+			.map(Boolean::valueOf)
+			.orElse(Boolean.TRUE);
+		logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+		// the {} placeholder is required, otherwise slf4j silently drops the argument
+		final String workingPath = parser.get("workingPath");
+		logger.info("workingPath: {}", workingPath);
+		final String outputEnrichedWorksPath = parser.get("outputEnrichedWorksPath");
+		logger.info("outputEnrichedWorksPath: {}", outputEnrichedWorksPath);
+		final String outputWorksPath = parser.get("outputWorksPath");
+		logger.info("outputWorksPath: {}", outputWorksPath);
+
+		SparkConf conf = new SparkConf();
+		runWithSparkSession(
+			conf,
+			isSparkSessionManaged,
+			spark -> {
+				JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
+
+				// NOTE(review): location of the summaries and name of the works file are hard-coded - confirm
+				JavaPairRDD<Text, Text> summariesRDD = sc
+					.sequenceFile(workingPath + "../orcid_summaries/output/authors.seq", Text.class, Text.class);
+				Dataset<AuthorData> summariesDataset = spark
+					.createDataset(
+						summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(),
+						Encoders.bean(AuthorData.class));
+
+				JavaPairRDD<Text, Text> activitiesRDD = sc
+					.sequenceFile(workingPath + outputWorksPath + "works_X.seq", Text.class, Text.class);
+				Dataset<WorkDataNoDoi> activitiesDataset = spark
+					.createDataset(
+						activitiesRDD.map(seq -> loadWorkFromJson(seq._1(), seq._2())).rdd(),
+						Encoders.bean(WorkDataNoDoi.class));
+
+				activitiesDataset
+					.joinWith(
+						summariesDataset,
+						activitiesDataset.col("oid").equalTo(summariesDataset.col("oid")), "inner")
+					.map(
+						(MapFunction<Tuple2<WorkDataNoDoi, AuthorData>, Tuple2<String, WorkDataNoDoi>>) value -> {
+							WorkDataNoDoi w = value._1;
+							AuthorData a = value._2;
+							// a work without contributors has nothing to match
+							if (w.getContributors() != null) {
+								AuthorMatcher.match(a, w.getContributors());
+							}
+							return new Tuple2<>(a.getOid(), w);
+						},
+						Encoders.tuple(Encoders.STRING(), Encoders.bean(WorkDataNoDoi.class)))
+					.filter(Objects::nonNull)
+					.toJavaRDD()
+					.saveAsTextFile(workingPath + outputEnrichedWorksPath);
+			});
+	}
+
+	/**
+	 * Builds an author from a summary record.
+	 *
+	 * @param orcidId the record key, used as author oid
+	 * @param json the record value, a json object from which "name", "surname" and "creditname" are read
+	 * @return the author; properties missing from the json are left null
+	 */
+	private static AuthorData loadAuthorFromJson(Text orcidId, Text json) {
+		AuthorData authorData = new AuthorData();
+		authorData.setOid(orcidId.toString());
+		JsonElement jElement = new JsonParser().parse(json.toString());
+		authorData.setName(getJsonValue(jElement, "name"));
+		authorData.setSurname(getJsonValue(jElement, "surname"));
+		authorData.setCreditName(getJsonValue(jElement, "creditname"));
+		return authorData;
+	}
+
+	/**
+	 * Builds a work from a record of the works sequence file.
+	 * <p>
+	 * The record value may be an envelope of the form {@code {"oid": ..., "work": ...}} where "work" holds
+	 * the serialized work, either as a JSON string or as a nested object; in that case the nested work is
+	 * deserialized, since deserializing the envelope itself would drop everything but the oid. A value that
+	 * has no "work" property is deserialized directly.
+	 *
+	 * @param orcidId the record key (not used)
+	 * @param json the record value
+	 * @return the deserialized work
+	 */
+	private static WorkDataNoDoi loadWorkFromJson(Text orcidId, Text json) {
+		final Gson gson = new Gson();
+		final JsonElement root = new JsonParser().parse(json.toString());
+		if (root.isJsonObject() && root.getAsJsonObject().has("work")) {
+			final JsonElement work = root.getAsJsonObject().get("work");
+			if (work.isJsonPrimitive() && work.getAsJsonPrimitive().isString()) {
+				return gson.fromJson(work.getAsString(), WorkDataNoDoi.class);
+			}
+			if (work.isJsonObject()) {
+				return gson.fromJson(work, WorkDataNoDoi.class);
+			}
+		}
+		return gson.fromJson(root, WorkDataNoDoi.class);
+	}
+
+	/**
+	 * Reads a property of a json object as string.
+	 *
+	 * @return the value, or null when the property is missing or is a json null
+	 */
+	private static String getJsonValue(JsonElement jElement, String property) {
+		if (jElement.getAsJsonObject().has(property)) {
+			JsonElement name = jElement.getAsJsonObject().get(property);
+			if (name != null && !name.isJsonNull()) {
+				return name.getAsString();
+			}
+		}
+		return null;
+	}
+}
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java
@ -1,5 +1,5 @@

-package eu.dnetlib.doiboost.orcid.json;
+package eu.dnetlib.doiboost.orcidnodoi.json;

 import com.google.gson.JsonObject;

--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java
@ -8,9 +8,9 @@ import eu.dnetlib.doiboost.orcid.model.AuthorData;
 public class Contributor extends AuthorData implements Serializable {
 	private String sequence;
 	private String role;
-	private boolean simpleMatch = false;
-	private Double score = 0.0;
-	private boolean bestMatch = false;
+	private transient boolean simpleMatch = false;
+	private transient Double score = 0.0;
+	private transient boolean bestMatch = false;

 	public String getSequence() {
 		return sequence;
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java
@ -97,5 +97,4 @@ public class WorkDataNoDoi implements Serializable {
 	public void setContributors(List<Contributor> contributors) {
 		this.contributors = contributors;
 	}
-
 }
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java
@ -0,0 +1,204 @@
+
+package eu.dnetlib.doiboost.orcidnodoi.similarity;
+
+import java.io.IOException;
+import java.text.Normalizer;
+import java.util.*;
+
+import org.apache.commons.text.similarity.JaroWinklerSimilarity;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+import com.ximpleware.NavException;
+import com.ximpleware.ParseException;
+import com.ximpleware.XPathEvalException;
+import com.ximpleware.XPathParseException;
+
+import eu.dnetlib.dhp.parser.utility.VtdException;
+import eu.dnetlib.doiboost.orcid.model.AuthorData;
+import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
+import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
+
+public class AuthorMatcher {
+
+	private static final Logger logger = LoggerFactory.getLogger(AuthorMatcher.class);
+	private static final Double threshold = 0.8;
+
+	public static void match(AuthorData author, List<Contributor> contributors)
+		throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
+
+		int matchCounter = 0;
+		List<Integer> matchCounters = Arrays.asList(matchCounter);
+		Contributor contributor = null;
+		contributors.forEach(c -> {
+			if (normalize(c.getCreditName()).contains(normalize(author.getName())) ||
+				normalize(c.getCreditName()).contains(normalize(author.getSurname())) ||
+				((author.getOtherName() != null)
+					&& normalize(c.getCreditName()).contains(normalize(author.getOtherName())))) {
+				matchCounters.set(0, matchCounters.get(0) + 1);
+				c.setSimpleMatch(true);
+			}
+		});
+		logger.info("match counter: " + Integer.toString(matchCounters.get(0)));
+		if (matchCounters.get(0) == 1) {
+			updateAuthorsSimpleMatch(contributors, author);
+		} else if (matchCounters.get(0) > 1) {
+			Optional<Contributor> optCon = contributors
+				.stream()
+				.filter(c -> c.isSimpleMatch())
+				.map(c -> {
+					c.setScore(bestMatch(author.getName(), author.getSurname(), c.getCreditName()));
+					logger.debug("nella map: " + c.getCreditName() + " score: " + c.getScore());
+					return c;
+				})
+				.filter(c -> c.getScore() >= threshold)
+				.max(Comparator.comparing(c -> c.getScore()));
+			Contributor bestMatchContributor = null;
+			if (optCon.isPresent()) {
+				bestMatchContributor = optCon.get();
+				bestMatchContributor.setBestMatch(true);
+				logger.info("best match: " + bestMatchContributor.getCreditName());
+				updateAuthorsSimilarityMatch(contributors, author);
+			}
+
+		}
+
+		logger.info("UPDATED contributors: ");
+		contributors.forEach(c -> {
+			logger
+				.info(
+					c.getOid() + " - " + c.getCreditName() + " - " +
+						c.getName() + " - " + c.getSurname() + " - " +
+						c.getRole() + " - " + c.getSequence());
+		});
+	}
+
+	private static Double bestMatch(String authorSurname, String authorName, String contributor) {
+		logger.debug(authorSurname + " " + authorName + " vs " + contributor);
+		String[] contributorSplitted = contributor.split(" ");
+		if (contributorSplitted.length == 0) {
+			return 0.0;
+		}
+		final String contributorName = contributorSplitted[contributorSplitted.length - 1];
+		String contributorSurname = "";
+		if (contributorSplitted.length > 1) {
+			StringJoiner joiner = new StringJoiner(" ");
+			for (int i = 0; i < contributorSplitted.length - 1; i++) {
+				joiner.add(contributorSplitted[i]);
+			}
+			contributorSurname = joiner.toString();
+		}
+		logger
+			.debug(
+				"contributorName: " + contributorName +
+					" contributorSurname: " + contributorSurname);
+		String authorNameNrm = normalize(authorName);
+		String authorSurnameNrm = normalize(authorSurname);
+		String contributorNameNrm = normalize(contributorName);
+		String contributorSurnameNrm = normalize(contributorSurname);
+		Double sm1 = similarity(authorNameNrm, authorSurnameNrm, contributorNameNrm, contributorSurnameNrm);
+		Double sm2 = similarity(authorNameNrm, authorSurnameNrm, contributorSurnameNrm, contributorNameNrm);
+		if (sm1.compareTo(sm2) >= 0) {
+			return sm1;
+		}
+		return sm2;
+	}
+
+	private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) {
+		Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB);
+		logger
+			.debug(nameA + ", " + surnameA + " <> " + nameB + ", " + surnameB + "   score: " + Double.toString(score));
+		return score;
+	}
+
+	private static Double similarityJaroWinkler(String nameA, String surnameA, String nameB, String surnameB) {
+		return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
+	}
+
+	private static String normalize(final String s) {
+		return nfd(s)
+			.toLowerCase()
+			// do not compact the regexes in a single expression, would cause StackOverflowError
+			// in case
+			// of large input strings
+			.replaceAll("(\\W)+", " ")
+			.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
+			.replaceAll("(\\p{Punct})+", " ")
+			.replaceAll("(\\d)+", " ")
+			.replaceAll("(\\n)+", " ")
+			.trim();
+	}
+
+	private static String nfd(final String s) {
+		return Normalizer.normalize(s, Normalizer.Form.NFD);
+	}
+
+	private static String parse(String name, String surname) {
+		return surname + " " + name;
+	}
+
+	private static void updateAuthorsSimpleMatch(List<Contributor> contributors, AuthorData author) {
+		contributors.forEach(c -> {
+			if (c.isSimpleMatch()) {
+				logger.info("simple match on : " + c.getCreditName());
+				c.setName(author.getName());
+				c.setSurname(author.getSurname());
+				c.setOid(author.getOid());
+			}
+		});
+		updateRanks(contributors);
+	}
+
+	private static void updateAuthorsSimilarityMatch(List<Contributor> contributors, AuthorData author) {
+		logger.info("inside updateAuthorsSimilarityMatch ...");
+		contributors.forEach(c -> {
+			logger
+				.info(
+					c.getOid() + " - " + c.getCreditName() + " - " +
+						c.getName() + " - " + c.getSurname() + " - " +
+						c.getRole() + " - " + c.getSequence() + " - best: " + c.isBestMatch() + " - simpe: "
+						+ c.isSimpleMatch());
+		});
+
+		contributors
+			.stream()
+			.filter(c -> c.isBestMatch())
+			.forEach(c -> {
+				logger.info("similarity match on : " + c.getCreditName());
+				c.setName(author.getName());
+				c.setSurname(author.getSurname());
+				c.setOid(author.getOid());
+			});
+		updateRanks(contributors);
+	}
+
+	private static void updateRanks(List<Contributor> contributors) {
+		boolean seqFound = false;
+		if (contributors
+			.stream()
+			.filter(
+				c -> c.getRole() != null && c.getSequence() != null &&
+					c.getRole().equals("author") && (c.getSequence().equals("first") ||
+						c.getSequence().equals("additional")))
+			.count() > 0) {
+			seqFound = true;
+			logger.info("sequence data found");
+		}
+		if (!seqFound) {
+			List<Integer> seqIds = Arrays.asList(0);
+			contributors.forEach(c -> {
+				int currentSeq = seqIds.get(0) + 1;
+				seqIds.set(0, currentSeq);
+				c.setSequence(Integer.toString(seqIds.get(0)));
+			});
+		}
+	}
+
+	private static String toJson(WorkDataNoDoi work) {
+		GsonBuilder builder = new GsonBuilder();
+		Gson gson = builder.create();
+		return gson.toJson(work);
+	}
+}
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml
@ -0,0 +1,22 @@
+<configuration>
+    <!-- Default properties of the gen_enriched_orcid_works oozie application. -->
+    <!-- Share library made available to the java actions. -->
+    <property>
+        <name>oozie.action.sharelib.for.java</name>
+        <value>spark2</value>
+    </property>
+    <!-- Launcher settings: user classpath precedence and JVM options (4g max heap) of the map task. -->
+    <property>
+        <name>oozie.launcher.mapreduce.user.classpath.first</name>
+        <value>true</value>
+    </property>
+    <property>
+      <name>oozie.launcher.mapreduce.map.java.opts</name>
+      <value>-Xmx4g</value>
+    </property>
+    <!-- Cluster endpoints. NOTE(review): host names are environment specific - confirm before reuse. -->
+    <property>
+        <name>jobTracker</name>
+        <value>hadoop-rm3.garr-pa1.d4science.org:8032</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
+    </property>
+</configuration>
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml
@ -0,0 +1,524 @@
+<workflow-app name="Gen Enriched Orcid Works" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>workingPath_activities</name>
+            <description>the working dir base path</description>
+        </property>
+        <!--
+            Each shell_cmd_N downloads one ORCID activities archive to /tmp, copies it to HDFS and
+            removes the local copy. The command is run through "bash -c", whose exit status is the
+            status of the last command executed. "set -e" makes bash stop at the first failing step,
+            so a failed download or HDFS copy fails the shell action instead of being masked by the
+            exit status of the final "rm -f" (which would otherwise leave a truncated archive on HDFS).
+            NOTE(review): the HDFS target directory is hard-coded to /data/orcid_activities while the
+            existence checks use workingPath_activities; keep the two aligned.
+        -->
+        <property>
+            <name>shell_cmd_0</name>
+            <value>set -e ; wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz ; rm -f /tmp/ORCID_2019_activites_0.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 0</description>
+        </property>
+        <property>
+            <name>shell_cmd_1</name>
+            <value>set -e ; wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz ; rm -f /tmp/ORCID_2019_activites_1.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 1</description>
+        </property>
+        <property>
+            <name>shell_cmd_2</name>
+            <value>set -e ; wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz ; rm -f /tmp/ORCID_2019_activites_2.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 2</description>
+        </property>
+        <property>
+            <name>shell_cmd_3</name>
+            <value>set -e ; wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz ; rm -f /tmp/ORCID_2019_activites_3.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 3</description>
+        </property>
+        <property>
+            <name>shell_cmd_4</name>
+            <value>set -e ; wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz ; rm -f /tmp/ORCID_2019_activites_4.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 4</description>
+        </property>
+        <property>
+            <name>shell_cmd_5</name>
+            <value>set -e ; wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz ; rm -f /tmp/ORCID_2019_activites_5.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 5</description>
+        </property>
+        <property>
+            <name>shell_cmd_6</name>
+            <value>set -e ; wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz ; rm -f /tmp/ORCID_2019_activites_6.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 6</description>
+        </property>
+        <property>
+            <name>shell_cmd_7</name>
+            <value>set -e ; wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz ; rm -f /tmp/ORCID_2019_activites_7.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 7</description>
+        </property>
+        <property>
+            <name>shell_cmd_8</name>
+            <value>set -e ; wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz ; rm -f /tmp/ORCID_2019_activites_8.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 8</description>
+        </property>
+        <property>
+            <name>shell_cmd_9</name>
+            <value>set -e ; wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz ; rm -f /tmp/ORCID_2019_activites_9.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 9</description>
+        </property>
+        <property>
+            <name>shell_cmd_X</name>
+            <value>set -e ; wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz ; rm -f /tmp/ORCID_2019_activites_X.tar.gz
+            </value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file X</description>
+        </property>
+    </parameters>
+    
+    <start to="ResetWorkingPath"/>
+    
+    
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+    
+    <action name="ResetWorkingPath">
+        <fs>
+            <delete path='${workingPath_activities}/no_doi_works/*'/>
+            <delete path='${workingPath_activities}/no_doi_enriched_works/*'/>
+        </fs>
+        <ok to="fork_gen_orcid_author_work"/>
+        <error to="Kill"/>
+    </action>
+    
+    <fork name = "fork_gen_orcid_author_work">
+      <path start = "check_exist_on_hdfs_activities_0"/>
+      <path start = "check_exist_on_hdfs_activities_1"/>
+      <path start = "check_exist_on_hdfs_activities_2"/>
+	  <path start = "check_exist_on_hdfs_activities_3"/>
+	  <path start = "check_exist_on_hdfs_activities_4"/>
+	  <path start = "check_exist_on_hdfs_activities_5"/>
+	  <path start = "check_exist_on_hdfs_activities_6"/>
+	  <path start = "check_exist_on_hdfs_activities_7"/>
+	  <path start = "check_exist_on_hdfs_activities_8"/>
+	  <path start = "check_exist_on_hdfs_activities_9"/>
+	  <path start = "check_exist_on_hdfs_activities_X"/>
+   	</fork>
+   	
+    <decision name="check_exist_on_hdfs_activities_0">
+         <switch>
+            <case to="GenOrcidAuthorWork_0">
+              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_0.tar.gz'))}
+            </case>
+            <default to="Download_0" />
+         </switch>
+	</decision>
+	
+    <action name="Download_0">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_0}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_0"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_0">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_0.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_0.seq</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    <decision name="check_exist_on_hdfs_activities_1">
+         <switch>
+            <case to="GenOrcidAuthorWork_1">
+              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_1.tar.gz'))}
+            </case>
+            <default to="Download_1" />
+         </switch>
+	</decision>
+	
+    <action name="Download_1">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_1}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_1"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_1">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_1.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_1.seq</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    <decision name="check_exist_on_hdfs_activities_2">
+         <switch>
+            <case to="GenOrcidAuthorWork_2">
+              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_2.tar.gz'))}
+            </case>
+            <default to="Download_2" />
+         </switch>
+	</decision>
+	
+    <action name="Download_2">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_2}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_2"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_2">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_2.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_2.seq</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    <decision name="check_exist_on_hdfs_activities_3">
+         <switch>
+            <case to="GenOrcidAuthorWork_3">
+              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_3.tar.gz'))}
+            </case>
+            <default to="Download_3" />
+         </switch>
+	</decision>
+	
+    <action name="Download_3">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_3}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_3"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_3">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_3.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_3.seq</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    <decision name="check_exist_on_hdfs_activities_4">
+         <switch>
+            <case to="GenOrcidAuthorWork_4">
+              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_4.tar.gz'))}
+            </case>
+            <default to="Download_4" />
+         </switch>
+	</decision>
+	
+    <action name="Download_4">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_4}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_4"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_4">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_4.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_4.seq</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    <decision name="check_exist_on_hdfs_activities_5">
+         <switch>
+            <case to="GenOrcidAuthorWork_5">
+              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_5.tar.gz'))}
+            </case>
+            <default to="Download_5" />
+         </switch>
+	</decision>
+	
+    <action name="Download_5">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_5}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_5"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_5">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_5.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_5.seq</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    <decision name="check_exist_on_hdfs_activities_6">
+         <switch>
+            <case to="GenOrcidAuthorWork_6">
+              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_6.tar.gz'))}
+            </case>
+            <default to="Download_6" />
+         </switch>
+	</decision>
+	
+    <action name="Download_6">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_6}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_6"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_6">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_6.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_6.seq</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    
+    <decision name="check_exist_on_hdfs_activities_7">
+         <switch>
+            <case to="GenOrcidAuthorWork_7">
+              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_7.tar.gz'))}
+            </case>
+            <default to="Download_7" />
+         </switch>
+	</decision>
+	
+    <action name="Download_7">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_7}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_7"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_7">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_7.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_7.seq</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    <decision name="check_exist_on_hdfs_activities_8">
+         <switch>
+            <case to="GenOrcidAuthorWork_8">
+              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_8.tar.gz'))}
+            </case>
+            <default to="Download_8" />
+         </switch>
+	</decision>
+	
+    <action name="Download_8">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_8}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_8"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_8">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_8.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_8.seq</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    <decision name="check_exist_on_hdfs_activities_9">
+         <switch>
+            <case to="GenOrcidAuthorWork_9">
+              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_9.tar.gz'))}
+            </case>
+            <default to="Download_9" />
+         </switch>
+	</decision>
+	
+    <action name="Download_9">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_9}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_9"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_9">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_9.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_9.seq</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    <decision name="check_exist_on_hdfs_activities_X">
+         <switch>
+            <case to="GenOrcidAuthorWork_X">
+              ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_X.tar.gz'))}
+            </case>
+            <default to="Download_X" />
+         </switch>
+	</decision>
+	
+    <action name="Download_X">
+		<shell xmlns="uri:oozie:shell-action:0.1">
+		<job-tracker>${jobTracker}</job-tracker>
+		<name-node>${nameNode}</name-node>
+		<exec>bash</exec>
+	    <argument>-c</argument>
+	    <argument>${shell_cmd_X}</argument>
+		<capture-output/>
+		</shell>
+	<ok to="GenOrcidAuthorWork_X"/>
+	<error to="Kill"/>
+	</action>
+	
+	<action name="GenOrcidAuthorWork_X">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_X.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_X.seq</arg>
+        </java>
+        <ok to="join_node"/>
+        <error to="Kill"/>
+    </action>
+    
+    <join name = "join_node" to = "Gen_Enriched_Orcid_Works"/>
+
+    <!--
+        Spark step run after all GenOrcidAuthorWork branches have joined.
+        The working path must be the same directory the previous actions wrote to
+        (workingPath_activities): ResetWorkingPath clears no_doi_works and no_doi_enriched_works
+        under it and every GenOrcidAuthorWork_N writes no_doi_works/works_N.seq under it.
+        sparkExecutorMemory, sparkExecutorCores and sparkDriverMemory are not declared in the
+        parameters section and have to be supplied by the job configuration.
+    -->
+    <action name="Gen_Enriched_Orcid_Works">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Gen_Enriched_Orcid_Works</name>
+            <class>eu.dnetlib.doiboost.orcidnodoi.SparkGenEnrichedOrcidWorks</class>
+            <jar>dhp-doiboost-1.2.3-SNAPSHOT.jar</jar>
+            <spark-opts>--num-executors 10 --conf spark.yarn.jars=&quot;hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2&quot; --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory}
+            </spark-opts>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-ow</arg><arg>no_doi_works/</arg>
+            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+    
+   <end name="End"/>
+</workflow-app>
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json
@ -0,0 +1,7 @@
+[
+ {"paramName":"n",   "paramLongName":"hdfsServerUri",	"paramDescription": "the server uri",   "paramRequired": true},
+ {"paramName":"w",   "paramLongName":"workingPath",	"paramDescription": "the default work path",	"paramRequired": true},
+ {"paramName":"f",   "paramLongName":"activitiesFileNameTarGz",	"paramDescription": "the name of the activities orcid file",	"paramRequired": true},
+ {"paramName":"ow",   "paramLongName":"outputWorksPath",	"paramDescription": "the relative folder of the sequential file to write",	"paramRequired": true},
+ {"paramName":"oew",   "paramLongName":"outputEnrichedWorksPath",	"paramDescription": "the relative folder of the sequential file to write the data",	"paramRequired": true}
+]
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
@ -1,15 +1,12 @@

 package eu.dnetlib.doiboost.orcidnodoi.xml;

-import com.ximpleware.NavException;
-import com.ximpleware.ParseException;
-import com.ximpleware.XPathEvalException;
-import com.ximpleware.XPathParseException;
-import eu.dnetlib.dhp.parser.utility.VtdException;
-import eu.dnetlib.doiboost.orcid.model.AuthorData;
-import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
-import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
-import jdk.nashorn.internal.ir.annotations.Ignore;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+import java.io.IOException;
+import java.text.Normalizer;
+import java.util.*;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.text.similarity.JaccardSimilarity;
 import org.apache.commons.text.similarity.JaroWinklerSimilarity;
@ -17,11 +14,20 @@ import org.junit.jupiter.api.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import java.io.IOException;
-import java.text.Normalizer;
-import java.util.*;
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+import com.ximpleware.NavException;
+import com.ximpleware.ParseException;
+import com.ximpleware.XPathEvalException;
+import com.ximpleware.XPathParseException;

-import static org.junit.jupiter.api.Assertions.assertNotNull;
+import eu.dnetlib.dhp.parser.utility.VtdException;
+import eu.dnetlib.dhp.schema.oaf.Author;
+import eu.dnetlib.doiboost.orcid.model.AuthorData;
+import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
+import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
+import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
+import jdk.nashorn.internal.ir.annotations.Ignore;

 public class OrcidNoDoiTest {

@ -33,100 +39,10 @@ public class OrcidNoDoiTest {
 	String nameB = "K";
 	String surnameB = "Abdel-Dayem";
 	String orcidIdA = "0000-0003-2760-1191";
-	Double threshold = 0.8;

 	@Test
 	@Ignore
-	private void similarityTest() throws Exception {
-		logger.info("running testSimilarity ....");
-		logger
-			.info(
-				"JaroWinklerSimilarity: "
-					+ Double.toString(similarityJaroWinkler(nameA, surnameA, nameB, surnameB)));
-		logger
-			.info(
-				"JaccardSimilarity: " + Double.toString(similarityJaccard(nameA, surnameA, nameB, surnameB)));
-	}
-
-	@Test
-	@Ignore
-	private void bestMatchTest() throws Exception {
-		logger.info("running bestMatchTest ....");
-		String contributor = surnameB + ", " + nameB;
-		logger.info("score: " + Double.toString(bestMatch(surnameA, nameA, contributor)));
-	}
-
-	private static Double bestMatch(String authorSurname, String authorName, String contributor) {
-		logger.debug(authorSurname + " " + authorName + " vs " + contributor);
-		String[] contributorSplitted = contributor.split(" ");
-		if (contributorSplitted.length == 0) {
-			return 0.0;
-		}
-		final String contributorName = contributorSplitted[contributorSplitted.length - 1];
-		String contributorSurname = "";
-		if (contributorSplitted.length > 1) {
-			StringJoiner joiner = new StringJoiner(" ");
-			for (int i = 0; i < contributorSplitted.length - 1; i++) {
-				joiner.add(contributorSplitted[i]);
-			}
-			contributorSurname = joiner.toString();
-		}
-		logger
-			.debug(
-				"contributorName: " + contributorName +
-					" contributorSurname: " + contributorSurname);
-		String authorNameNrm = normalize(authorName);
-		String authorSurnameNrm = normalize(authorSurname);
-		String contributorNameNrm = normalize(contributorName);
-		String contributorSurnameNrm = normalize(contributorSurname);
-		Double sm1 = similarity(authorNameNrm, authorSurnameNrm, contributorNameNrm, contributorSurnameNrm);
-		Double sm2 = similarity(authorNameNrm, authorSurnameNrm, contributorSurnameNrm, contributorNameNrm);
-		if (sm1.compareTo(sm2) >= 0) {
-			return sm1;
-		}
-		return sm2;
-	}
-
-	private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) {
-		Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB);
-		logger
-			.debug(nameA + ", " + surnameA + " <> " + nameB + ", " + surnameB + "   score: " + Double.toString(score));
-		return score;
-	}
-
-	private static Double similarityJaccard(String nameA, String surnameA, String nameB, String surnameB) {
-		return new JaccardSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
-	}
-
-	private static Double similarityJaroWinkler(String nameA, String surnameA, String nameB, String surnameB) {
-		return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
-	}
-
-	private static String parse(String name, String surname) {
-		return surname + " " + name;
-	}
-
-	private static String normalize(final String s) {
-		return nfd(s)
-			.toLowerCase()
-			// do not compact the regexes in a single expression, would cause StackOverflowError
-			// in case
-			// of large input strings
-			.replaceAll("(\\W)+", " ")
-			.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
-			.replaceAll("(\\p{Punct})+", " ")
-			.replaceAll("(\\d)+", " ")
-			.replaceAll("(\\n)+", " ")
-			.trim();
-	}
-
-	private static String nfd(final String s) {
-		return Normalizer.normalize(s, Normalizer.Form.NFD);
-	}
-
-	@Test
-	@Ignore
-	public void readPublicationFieldsTest()
+	private void readPublicationFieldsTest()
 		throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
 		logger.info("running loadPublicationFieldsTest ....");
 		String xml = IOUtils
@ -178,78 +94,10 @@ public class OrcidNoDoiTest {

 	}

-	private void updateRanks(List<Contributor> contributors) {
-		boolean seqFound = false;
-		if (contributors
-			.stream()
-			.filter(
-				c -> c.getRole() != null && c.getSequence() != null &&
-					c.getRole().equals("author") && (c.getSequence().equals("first") ||
-						c.getSequence().equals("additional")))
-			.count() > 0) {
-			seqFound = true;
-			logger.info("sequence data found");
-		}
-		if (!seqFound) {
-			List<Integer> seqIds = Arrays.asList(0);
-			contributors.forEach(c -> {
-				int currentSeq = seqIds.get(0) + 1;
-				seqIds.set(0, currentSeq);
-				c.setSequence(Integer.toString(seqIds.get(0)));
-			});
-		}
-	}
-
-	private void updateAuthorsSimpleMatch(List<Contributor> contributors, AuthorData author) {
-		contributors.forEach(c -> {
-			if (c.isSimpleMatch()) {
-				logger.info("simple match on : " + c.getCreditName());
-				c.setName(author.getName());
-				c.setSurname(author.getSurname());
-				c.setOid(author.getOid());
-			}
-		});
-		updateRanks(contributors);
-	}
-
-	private void updateAuthorsSimilarityMatch(List<Contributor> contributors, AuthorData author) {
-		logger.info("inside updateAuthorsSimilarityMatch ...");
-		contributors.forEach(c -> {
-			logger
-				.info(
-					c.getOid() + " - " + c.getCreditName() + " - " +
-						c.getName() + " - " + c.getSurname() + " - " +
-						c.getRole() + " - " + c.getSequence() + " - best: " + c.isBestMatch() + " - simpe: "
-						+ c.isSimpleMatch());
-		});
-
-		contributors
-			.stream()
-			.filter(c -> c.isBestMatch())
-			.forEach(c -> {
-				logger.info("similarity match on : " + c.getCreditName());
-				c.setName(author.getName());
-				c.setSurname(author.getSurname());
-				c.setOid(author.getOid());
-			});
-		updateRanks(contributors);
-	}
-
 	@Test
-	@Ignore
-	public void authorSimilarityMatchTest() throws Exception {
-		logger.info("running authorSimilarityMatchTest ....");
-		authorMatchTest("activity_work_0000-0003-2760-1191-similarity.xml");
-	}
-
-	@Test
-	private void authorSimpleMatchTest() throws Exception {
+	public void authorMatchTest() throws Exception {
 		logger.info("running authorSimpleMatchTest ....");
-		authorMatchTest("activity_work_0000-0003-2760-1191.xml");
-	}
-
-	private void authorMatchTest(String orcidWork)
-		throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
+		String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml";
 		AuthorData author = new AuthorData();
 		author.setName(nameA);
 		author.setSurname(surnameA);
@ -272,55 +120,9 @@ public class OrcidNoDoiTest {
 			logger.error("parsing xml", e);
 		}
 		assertNotNull(workData);
-		int matchCounter = 0;
-		List<Integer> matchCounters = Arrays.asList(matchCounter);
-		Contributor contributor = null;
-		workData.getContributors().forEach(c -> {
-			if (normalize(c.getCreditName()).contains(normalize(author.getName())) ||
-				normalize(c.getCreditName()).contains(normalize(author.getSurname())) ||
-				((author.getOtherName() != null)
-					&& normalize(c.getCreditName()).contains(normalize(author.getOtherName())))) {
-				matchCounters.set(0, matchCounters.get(0) + 1);
-				c.setSimpleMatch(true);
-			}
-		});
-		logger.info("match counter: " + Integer.toString(matchCounters.get(0)));
-		if (matchCounters.get(0) == 1) {
-			updateAuthorsSimpleMatch(workData.getContributors(), author);
-		} else if (matchCounters.get(0) > 1) {
-			Optional<Contributor> optCon = workData
-				.getContributors()
-				.stream()
-				.filter(c -> c.isSimpleMatch())
-				.map(c -> {
-					c.setScore(bestMatch(nameA, surnameA, c.getCreditName()));
-					logger.debug("nella map: " + c.getCreditName() + " score: " + c.getScore());
-					return c;
-				})
-				.filter(c -> c.getScore() >= threshold)
-				.max(Comparator.comparing(c -> c.getScore()));
-			Contributor bestMatchContributor = null;
-			if (optCon.isPresent()) {
-				bestMatchContributor = optCon.get();
-				bestMatchContributor.setBestMatch(true);
-				logger.info("best match: " + bestMatchContributor.getCreditName());
-				updateAuthorsSimilarityMatch(workData.getContributors(), author);
-			}
-
-		}
-
-		logger.info("UPDATED contributors: ");
-		workData.getContributors().forEach(c -> {
-			logger
-				.info(
-					c.getOid() + " - " + c.getCreditName() + " - " +
-						c.getName() + " - " + c.getSurname() + " - " +
-						c.getRole() + " - " + c.getSequence());
-		});
+		AuthorMatcher.match(author, workData.getContributors());
+		GsonBuilder builder = new GsonBuilder();
+		Gson gson = builder.create();
+		logger.info(gson.toJson(workData));
 	}
 }
-
-//
-//		orcid_RDD = sc.textFile(ORCID_DUMP_PATH)
-//		no_doi_works_RDD = orcid_RDD.map(orcid_map).filter(lambda x:x is not None).map(lambda x: json.dumps(x)).saveAsTextFile(path=ORCID_OPENAIRE_PATH,compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")
-//