Merge branch 'beta' into dependency-revision
commit 591b76047f
@@ -63,11 +63,13 @@
 	<dependencies>
 		<dependency>
-			<groupId>eu.dnetlib.dhp</groupId>
-			<artifactId>dhp-pace-core</artifactId>
-			<version>${project.version}</version>
+			<groupId>edu.cmu</groupId>
+			<artifactId>secondstring</artifactId>
+		</dependency>
+		<dependency>
+			<groupId>com.ibm.icu</groupId>
+			<artifactId>icu4j</artifactId>
 		</dependency>
 
 		<dependency>
 			<groupId>org.apache.hadoop</groupId>
 			<artifactId>hadoop-common</artifactId>
@@ -0,0 +1,100 @@
+
+package eu.dnetlib.pace.common;
+
+import com.google.common.base.Splitter;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Sets;
+import com.ibm.icu.text.Transliterator;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+
+import java.nio.charset.StandardCharsets;
+import java.text.Normalizer;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Set of common functions for the framework
+ *
+ * @author claudio
+ */
+public class PaceCommonUtils {
+
+	// transliterator
+	protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
+
+	protected static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
+	protected static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
+
+	protected static Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
+
+	protected static String fixAliases(final String s) {
+		final StringBuilder sb = new StringBuilder();
+
+		s.chars().forEach(ch -> {
+			final int i = StringUtils.indexOf(aliases_from, ch);
+			sb.append(i >= 0 ? aliases_to.charAt(i) : (char) ch);
+		});
+
+		return sb.toString();
+	}
+
+	protected static String transliterate(final String s) {
+		try {
+			return transliterator.transliterate(s);
+		} catch (Exception e) {
+			return s;
+		}
+	}
+
+	public static String normalize(final String s) {
+		return fixAliases(transliterate(nfd(unicodeNormalization(s))))
+			.toLowerCase()
+			// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
+			// strings
+			.replaceAll("[^ \\w]+", "")
+			.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
+			.replaceAll("(\\p{Punct})+", " ")
+			.replaceAll("(\\d)+", " ")
+			.replaceAll("(\\n)+", " ")
+			.trim();
+	}
+
+	public static String nfd(final String s) {
+		return Normalizer.normalize(s, Normalizer.Form.NFD);
+	}
+
+	public static String unicodeNormalization(final String s) {
+
+		Matcher m = hexUnicodePattern.matcher(s);
+		StringBuffer buf = new StringBuffer(s.length());
+		while (m.find()) {
+			String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16));
+			m.appendReplacement(buf, Matcher.quoteReplacement(ch));
+		}
+		m.appendTail(buf);
+		return buf.toString();
+	}
+
+	public static Set<String> loadFromClasspath(final String classpath) {
+
+		Transliterator transliterator = Transliterator.getInstance("Any-Eng");
+
+		final Set<String> h = Sets.newHashSet();
+		try {
+			for (final String s : IOUtils
+				.readLines(PaceCommonUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
+				h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords
+			}
+		} catch (final Throwable e) {
+			return Sets.newHashSet();
+		}
+		return h;
+	}
+
+	protected static Iterable<String> tokens(final String s, final int maxTokens) {
+		return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
+	}
+
+}
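For orientation, a minimal usage sketch of the new utility class (not part of the commit; the sample input is illustrative, only the class and method names come from the diff above):

    import eu.dnetlib.pace.common.PaceCommonUtils;

    public class NormalizeExample {
        public static void main(String[] args) {
            // unicodeNormalization() first unescapes literal \uXXXX sequences,
            // nfd() decomposes accented characters, and the regex chain in
            // normalize() strips combining marks, punctuation and digits,
            // so accented spellings collapse to a plain lower-case ASCII form.
            System.out.println(PaceCommonUtils.normalize("Caf\\u00E9, Étude 2023"));
            // expected to print something like: "cafe etude"
        }
    }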
@@ -1,21 +1,20 @@
 
 package eu.dnetlib.pace.model;
 
-import java.nio.charset.Charset;
-import java.text.Normalizer;
-import java.util.List;
-import java.util.Set;
-
 import com.google.common.base.Joiner;
 import com.google.common.base.Splitter;
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Lists;
 import com.google.common.hash.Hashing;
 
-import eu.dnetlib.pace.common.AbstractPaceFunctions;
+import eu.dnetlib.pace.common.PaceCommonUtils;
 import eu.dnetlib.pace.util.Capitalise;
 import eu.dnetlib.pace.util.DotAbbreviations;
 
+import java.nio.charset.Charset;
+import java.text.Normalizer;
+import java.util.List;
+import java.util.Set;
+
 public class Person {
 
 	private static final String UTF8 = "UTF-8";
@@ -86,7 +85,7 @@ public class Person {
 
 	private List<String> splitTerms(final String s) {
 		if (particles == null) {
-			particles = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
+			particles = PaceCommonUtils.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
 		}
 
 		final List<String> list = Lists.newArrayList();
@@ -1,9 +1,8 @@
 
 package eu.dnetlib.pace.util;
 
-import org.apache.commons.lang3.text.WordUtils;
-
 import com.google.common.base.Function;
+import org.apache.commons.lang3.text.WordUtils;
 
 public class Capitalise implements Function<String, String> {
 
@@ -15,4 +14,4 @@ public class Capitalise implements Function<String, String> {
 	public String apply(final String s) {
 		return WordUtils.capitalize(s.toLowerCase(), DELIM);
 	}
-};
+}
@@ -8,4 +8,4 @@ public class DotAbbreviations implements Function<String, String> {
 	public String apply(String s) {
 		return s.length() == 1 ? s + "." : s;
 	}
-};
+}
@@ -49,6 +49,12 @@
 	</build>
 
 	<dependencies>
+		<dependency>
+			<groupId>eu.dnetlib.dhp</groupId>
+			<artifactId>dhp-common</artifactId>
+			<version>${project.version}</version>
+		</dependency>
+
 		<dependency>
 			<groupId>edu.cmu</groupId>
 			<artifactId>secondstring</artifactId>
@@ -1,32 +1,26 @@
 
 package eu.dnetlib.pace.common;
 
+import com.google.common.base.Joiner;
+import com.google.common.collect.Sets;
+import com.ibm.icu.text.Transliterator;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+
 import java.io.IOException;
 import java.io.StringWriter;
 import java.nio.charset.StandardCharsets;
-import java.text.Normalizer;
 import java.util.*;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 
-import org.apache.commons.io.IOUtils;
-import org.apache.commons.lang3.StringUtils;
-
-import com.google.common.base.Joiner;
-import com.google.common.base.Splitter;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Sets;
-import com.ibm.icu.text.Transliterator;
-
-import eu.dnetlib.pace.clustering.NGramUtils;
-
 /**
  * Set of common functions for the framework
  *
  * @author claudio
 */
-public class AbstractPaceFunctions {
+public class AbstractPaceFunctions extends PaceCommonUtils {
 
 	// city map to be used when translating the city names into codes
 	private static Map<String, String> cityMap = AbstractPaceFunctions
@@ -41,9 +35,6 @@ public class AbstractPaceFunctions {
 	protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
 	protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
 
-	// transliterator
-	protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
-
 	// blacklist of ngrams: to avoid generic keys
 	protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
 
@@ -51,8 +42,6 @@ public class AbstractPaceFunctions {
 	public static final Pattern HTML_REGEX = Pattern.compile("<[^>]*>");
 
 	private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
-	private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
-	private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
 
 	// doi prefix for normalization
 	public static final Pattern DOI_PREFIX = Pattern.compile("(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)");
@@ -129,25 +118,6 @@ public class AbstractPaceFunctions {
 		return numberPattern.matcher(strNum).matches();
 	}
 
-	protected static String fixAliases(final String s) {
-		final StringBuilder sb = new StringBuilder();
-
-		s.chars().forEach(ch -> {
-			final int i = StringUtils.indexOf(aliases_from, ch);
-			sb.append(i >= 0 ? aliases_to.charAt(i) : (char) ch);
-		});
-
-		return sb.toString();
-	}
-
-	protected static String transliterate(final String s) {
-		try {
-			return transliterator.transliterate(s);
-		} catch (Exception e) {
-			return s;
-		}
-	}
-
 	protected static String removeSymbols(final String s) {
 		final StringBuilder sb = new StringBuilder();
 
@@ -162,23 +132,6 @@ public class AbstractPaceFunctions {
 		return s != null;
 	}
 
-	public static String normalize(final String s) {
-		return fixAliases(transliterate(nfd(unicodeNormalization(s))))
-			.toLowerCase()
-			// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
-			// strings
-			.replaceAll("[^ \\w]+", "")
-			.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
-			.replaceAll("(\\p{Punct})+", " ")
-			.replaceAll("(\\d)+", " ")
-			.replaceAll("(\\n)+", " ")
-			.trim();
-	}
-
-	public static String nfd(final String s) {
-		return Normalizer.normalize(s, Normalizer.Form.NFD);
-	}
-
 	public static String utf8(final String s) {
 		byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
 		return new String(bytes, StandardCharsets.UTF_8);
@@ -233,22 +186,6 @@ public class AbstractPaceFunctions {
 		return newset;
 	}
 
-	public static Set<String> loadFromClasspath(final String classpath) {
-
-		Transliterator transliterator = Transliterator.getInstance("Any-Eng");
-
-		final Set<String> h = Sets.newHashSet();
-		try {
-			for (final String s : IOUtils
-				.readLines(NGramUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
-				h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords
-			}
-		} catch (final Throwable e) {
-			return Sets.newHashSet();
-		}
-		return h;
-	}
-
 	public static Map<String, String> loadMapFromClasspath(final String classpath) {
 
 		Transliterator transliterator = Transliterator.getInstance("Any-Eng");
@@ -303,10 +240,6 @@ public class AbstractPaceFunctions {
 		return StringUtils.substring(s, 0, 1).toLowerCase();
 	}
 
-	protected static Iterable<String> tokens(final String s, final int maxTokens) {
-		return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
-	}
-
 	public static String normalizePid(String pid) {
 		return DOI_PREFIX.matcher(pid.toLowerCase()).replaceAll("");
 	}
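A brief sketch of why the refactoring above is source-compatible (illustrative, not part of the commit): static members are inherited in Java, so callers that referenced the moved helpers through AbstractPaceFunctions keep compiling unchanged.

    import eu.dnetlib.pace.common.AbstractPaceFunctions;

    public class InheritanceExample {
        public static void main(String[] args) {
            // normalize() now lives in PaceCommonUtils, but it still resolves
            // through the subclass name, so existing call sites are untouched.
            System.out.println(AbstractPaceFunctions.normalize("Héllo, Wörld!"));
        }
    }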
File diff suppressed because it is too large
@@ -102,6 +102,8 @@
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
 				--conf spark.sql.shuffle.partitions=15000
+				--conf spark.network.timeout=300s
+				--conf spark.shuffle.registration.timeout=50000
 			</spark-opts>
 			<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
 			<arg>--graphOutputPath</arg><arg>${graphOutputPath}</arg>
@@ -33,16 +33,14 @@
 		<description>max number of elements in a connected component</description>
 	</property>
 	<property>
-		<name>sparkDriverMemory</name>
-		<description>memory for driver process</description>
+		<name>sparkResourceOpts</name>
+		<value>--executor-memory=6G --conf spark.executor.memoryOverhead=4G --executor-cores=6 --driver-memory=8G --driver-cores=4</value>
+		<description>spark resource options</description>
 	</property>
 	<property>
-		<name>sparkExecutorMemory</name>
-		<description>memory for individual executor</description>
-	</property>
-	<property>
-		<name>sparkExecutorCores</name>
-		<description>number of cores used by single executor</description>
+		<name>sparkResourceOptsCreateMergeRel</name>
+		<value>--executor-memory=6G --conf spark.executor.memoryOverhead=4G --executor-cores=6 --driver-memory=8G --driver-cores=4</value>
+		<description>spark resource options</description>
 	</property>
 	<property>
 		<name>oozieActionShareLibForSpark2</name>
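Oozie resolves the EL expressions inside <spark-opts> before submitting each action, so the single ${sparkResourceOpts} token used in the hunks below expands to the whole flag list carried by the property value; with the default shown above, the resolved options would read (illustrative):

    --executor-memory=6G --conf spark.executor.memoryOverhead=4G --executor-cores=6 --driver-memory=8G --driver-cores=4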
@@ -119,9 +117,7 @@
 			<class>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</class>
 			<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
 			<spark-opts>
-				--executor-memory=${sparkExecutorMemory}
-				--executor-cores=${sparkExecutorCores}
-				--driver-memory=${sparkDriverMemory}
+				${sparkResourceOpts}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -146,9 +142,7 @@
 			<class>eu.dnetlib.dhp.oa.dedup.SparkWhitelistSimRels</class>
 			<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
 			<spark-opts>
-				--executor-memory=${sparkExecutorMemory}
-				--executor-cores=${sparkExecutorCores}
-				--driver-memory=${sparkDriverMemory}
+				${sparkResourceOpts}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -174,9 +168,7 @@
 			<class>eu.dnetlib.dhp.oa.dedup.SparkCreateMergeRels</class>
 			<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
 			<spark-opts>
-				--executor-memory=${sparkExecutorMemory}
-				--executor-cores=${sparkExecutorCores}
-				--driver-memory=${sparkDriverMemory}
+				${sparkResourceOptsCreateMergeRel}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -203,9 +195,7 @@
 			<class>eu.dnetlib.dhp.oa.dedup.SparkCreateDedupRecord</class>
 			<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
 			<spark-opts>
-				--executor-memory=${sparkExecutorMemory}
-				--executor-cores=${sparkExecutorCores}
-				--driver-memory=${sparkDriverMemory}
+				${sparkResourceOpts}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -230,9 +220,7 @@
 			<class>eu.dnetlib.dhp.oa.dedup.SparkCopyOpenorgsMergeRels</class>
 			<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
 			<spark-opts>
-				--executor-memory=${sparkExecutorMemory}
-				--executor-cores=${sparkExecutorCores}
-				--driver-memory=${sparkDriverMemory}
+				${sparkResourceOpts}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -257,9 +245,7 @@
 			<class>eu.dnetlib.dhp.oa.dedup.SparkCreateOrgsDedupRecord</class>
 			<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
 			<spark-opts>
-				--executor-memory=${sparkExecutorMemory}
-				--executor-cores=${sparkExecutorCores}
-				--driver-memory=${sparkDriverMemory}
+				${sparkResourceOpts}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -283,9 +269,7 @@
 			<class>eu.dnetlib.dhp.oa.dedup.SparkUpdateEntity</class>
 			<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
 			<spark-opts>
-				--executor-memory=${sparkExecutorMemory}
-				--executor-cores=${sparkExecutorCores}
-				--driver-memory=${sparkDriverMemory}
+				${sparkResourceOpts}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -309,9 +293,7 @@
 			<class>eu.dnetlib.dhp.oa.dedup.SparkCopyRelationsNoOpenorgs</class>
 			<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
 			<spark-opts>
-				--executor-memory=${sparkExecutorMemory}
-				--executor-cores=${sparkExecutorCores}
-				--driver-memory=${sparkDriverMemory}
+				${sparkResourceOpts}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -100,16 +100,12 @@
 				--executor-cores=${sparkExecutorCores}
 				--executor-memory=${sparkExecutorMemory}
 				--driver-memory=${sparkDriverMemory}
+				--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-				--conf spark.dynamicAllocation.enabled=true
-				--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
-				--conf spark.sql.shuffle.partitions=3840
-				--conf spark.speculation=false
-				--conf spark.hadoop.mapreduce.map.speculative=false
-				--conf spark.hadoop.mapreduce.reduce.speculative=false
+				--conf spark.sql.shuffle.partitions=8000
 			</spark-opts>
 			<arg>--sourcePath</arg><arg>${sourcePath}</arg>
 			<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
@@ -132,12 +128,11 @@
 				--executor-cores=${sparkExecutorCores}
 				--executor-memory=${sparkExecutorMemory}
 				--driver-memory=${sparkDriverMemory}
+				--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-				--conf spark.dynamicAllocation.enabled=true
-				--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
 			</spark-opts>
 			<arg>--sourcePath</arg><arg>${sourcePath}</arg>
 			<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
@@ -160,12 +155,11 @@
 				--executor-cores=${sparkExecutorCores}
 				--executor-memory=${sparkExecutorMemory}
 				--driver-memory=${sparkDriverMemory}
+				--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-				--conf spark.dynamicAllocation.enabled=true
-				--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
 			</spark-opts>
 			<arg>--sourcePath</arg><arg>${sourcePath}</arg>
 			<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
@@ -188,12 +182,11 @@
 				--executor-cores=${sparkExecutorCores}
 				--executor-memory=${sparkExecutorMemory}
 				--driver-memory=${sparkDriverMemory}
+				--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-				--conf spark.dynamicAllocation.enabled=true
-				--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
 			</spark-opts>
 			<arg>--sourcePath</arg><arg>${sourcePath}</arg>
 			<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
@@ -218,12 +211,11 @@
 				--executor-cores=${sparkExecutorCores}
 				--executor-memory=${sparkExecutorMemory}
 				--driver-memory=${sparkDriverMemory}
+				--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-				--conf spark.dynamicAllocation.enabled=true
-				--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
 			</spark-opts>
 			<arg>--sourcePath</arg><arg>${workingDir}/orcid/targetOrcidAssoc</arg>
 			<arg>--outputPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
@@ -247,19 +239,14 @@
 			<class>eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob</class>
 			<jar>dhp-enrichment-${projectVersion}.jar</jar>
 			<spark-opts>
-				--executor-cores=4
-				--executor-memory=4G
+				--executor-cores=${sparkExecutorCores}
+				--executor-memory=${sparkExecutorMemory}
 				--driver-memory=${sparkDriverMemory}
-				--conf spark.executor.memoryOverhead=5G
+				--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-				--conf spark.dynamicAllocation.enabled=true
-				--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
-				--conf spark.speculation=false
-				--conf spark.hadoop.mapreduce.map.speculative=false
-				--conf spark.hadoop.mapreduce.reduce.speculative=false
 				--conf spark.sql.shuffle.partitions=15000
 			</spark-opts>
 			<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
@@ -282,15 +269,12 @@
 				--executor-cores=${sparkExecutorCores}
 				--executor-memory=${sparkExecutorMemory}
 				--driver-memory=${sparkDriverMemory}
+				--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-				--conf spark.dynamicAllocation.enabled=true
-				--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
-				--conf spark.speculation=false
-				--conf spark.hadoop.mapreduce.map.speculative=false
-				--conf spark.hadoop.mapreduce.reduce.speculative=false
+				--conf spark.sql.shuffle.partitions=8000
 			</spark-opts>
 			<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
 			<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
@@ -312,15 +296,12 @@
 				--executor-cores=${sparkExecutorCores}
 				--executor-memory=${sparkExecutorMemory}
 				--driver-memory=${sparkDriverMemory}
+				--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-				--conf spark.dynamicAllocation.enabled=true
-				--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
-				--conf spark.speculation=false
-				--conf spark.hadoop.mapreduce.map.speculative=false
-				--conf spark.hadoop.mapreduce.reduce.speculative=false
+				--conf spark.sql.shuffle.partitions=8000
 			</spark-opts>
 			<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
 			<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
@@ -342,15 +323,12 @@
 				--executor-cores=${sparkExecutorCores}
 				--executor-memory=${sparkExecutorMemory}
 				--driver-memory=${sparkDriverMemory}
+				--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-				--conf spark.dynamicAllocation.enabled=true
-				--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
-				--conf spark.speculation=false
-				--conf spark.hadoop.mapreduce.map.speculative=false
-				--conf spark.hadoop.mapreduce.reduce.speculative=false
+				--conf spark.sql.shuffle.partitions=4000
 			</spark-opts>
 			<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
 			<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
@@ -362,15 +340,6 @@
 		</action>
 
 		<join name="wait2" to="End"/>
 
-		<!-- <action name="reset_workingDir">-->
-		<!-- <fs>-->
-		<!-- <delete path="${workingDir}"/>-->
-		<!-- <mkdir path="${workingDir}"/>-->
-		<!-- </fs>-->
-		<!-- <ok to="End"/>-->
-		<!-- <error to="Kill"/>-->
-		<!-- </action>-->
-
 		<end name="End"/>
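For context on the recurring --conf spark.executor.memoryOverhead additions in these workflows: that setting is the off-heap allowance YARN reserves per executor container on top of the heap, so pinning it to ${sparkExecutorMemory} makes each container request roughly twice the executor heap (a 6G executor would ask for about 12G in total).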
@@ -90,6 +90,12 @@
 			<version>${project.version}</version>
 		</dependency>
 
+		<dependency>
+			<groupId>eu.dnetlib.dhp</groupId>
+			<artifactId>dhp-pace-core</artifactId>
+			<version>${project.version}</version>
+		</dependency>
+
 		<dependency>
 			<groupId>com.jayway.jsonpath</groupId>
 			<artifactId>json-path</artifactId>
@@ -71,6 +71,7 @@
 				--executor-memory=${sparkHighExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
 				--driver-memory=${sparkHighDriverMemory}
+				--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
 				--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@@ -108,6 +109,7 @@
 				--executor-memory=${sparkHighExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
 				--driver-memory=${sparkNormalDriverMemory}
+				--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
 				--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@@ -141,6 +143,7 @@
 				--executor-memory=${sparkHighExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
 				--driver-memory=${sparkNormalDriverMemory}
+				--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
 				--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@@ -176,6 +179,7 @@
 				--executor-memory=${sparkHighExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
 				--driver-memory=${sparkNormalDriverMemory}
+				--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
 				--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@@ -209,6 +213,7 @@
 				--executor-memory=${sparkHighExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
 				--driver-memory=${sparkNormalDriverMemory}
+				--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
 				--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@@ -245,6 +250,7 @@
 				--executor-memory=${sparkHighExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
 				--driver-memory=${sparkNormalDriverMemory}
+				--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
 				--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@@ -315,6 +321,7 @@
 				--executor-memory=${sparkNormalExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
 				--driver-memory=${sparkNormalDriverMemory}
+				--conf spark.executor.memoryOverhead=${sparkNormalExecutorMemory}
 				--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@@ -361,6 +368,7 @@
 				--executor-memory=${sparkNormalExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
 				--driver-memory=${sparkNormalDriverMemory}
+				--conf spark.executor.memoryOverhead=${sparkNormalExecutorMemory}
 				--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@@ -409,6 +417,7 @@
 				--executor-memory=${sparkHighExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
 				--driver-memory=${sparkHighDriverMemory}
+				--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
 				--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@@ -444,6 +453,7 @@
 				--executor-memory=${sparkHighExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
 				--driver-memory=${sparkHighDriverMemory}
+				--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
 				--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@@ -482,6 +492,7 @@
 				--executor-memory=${sparkHighExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
 				--driver-memory=${sparkNormalDriverMemory}
+				--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
 				--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@@ -533,6 +544,7 @@
 				--executor-memory=${sparkNormalExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
 				--driver-memory=${sparkNormalDriverMemory}
+				--conf spark.executor.memoryOverhead=${sparkNormalExecutorMemory}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}