conflict resolution in the comparator test class

2024-11-18 14:59:30 +01:00 · 2024-11-18 14:59:30 +01:00 · c97facf5e6
parent 6c17993d16 cf7d9a32ab
commit c97facf5e6
91 changed files with 1954 additions and 804 deletions
--- a/.gitignore
+++ b/.gitignore
@ -28,3 +28,4 @@ spark-warehouse
 /**/.scalafmt.conf
 /.java-version
 /dhp-shade-package/dependency-reduced-pom.xml
 /**/job.properties
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/CoAuthorshipIterator.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/CoAuthorshipIterator.java
@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.actionmanager.personentity;
+package eu.dnetlib.dhp.common.person;
 import java.util.Arrays;
 import java.util.Iterator;
@ -61,7 +61,7 @@ public class CoAuthorshipIterator implements Iterator<Relation> {
 	private Relation getRelation(String orcid1, String orcid2) {
 		String source = PERSON_PREFIX + IdentifierFactory.md5(orcid1);
 		String target = PERSON_PREFIX + IdentifierFactory.md5(orcid2);
-		return OafMapperUtils
+		Relation relation = OafMapperUtils
 			.getRelation(
 				source, target, ModelConstants.PERSON_PERSON_RELTYPE,
 				ModelConstants.PERSON_PERSON_SUBRELTYPE,
@ -76,5 +76,7 @@ public class CoAuthorshipIterator implements Iterator<Relation> {
 								ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
 						"0.91"),
 				null);
 		relation.setValidated(true);
 		return relation;
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/Coauthors.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/Coauthors.java
@ -1,12 +1,9 @@
-package eu.dnetlib.dhp.actionmanager.personentity;
+package eu.dnetlib.dhp.common.person;
 import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.List;
 import eu.dnetlib.dhp.schema.oaf.Relation;
 public class Coauthors implements Serializable {
 	private List<String> coauthors;
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/GroupEntitiesSparkJob.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/GroupEntitiesSparkJob.java
@ -2,8 +2,7 @@
 package eu.dnetlib.dhp.oa.merge;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
-import static org.apache.spark.sql.functions.col;
+import static org.apache.spark.sql.functions.*;
 import static org.apache.spark.sql.functions.when;
 import java.util.Map;
 import java.util.Optional;
@ -135,7 +134,9 @@ public class GroupEntitiesSparkJob {
 					.applyCoarVocabularies(entity, vocs),
 				OAFENTITY_KRYO_ENC)
 			.groupByKey((MapFunction<OafEntity, String>) OafEntity::getId, Encoders.STRING())
-			.mapGroups((MapGroupsFunction<String, OafEntity, OafEntity>) MergeUtils::mergeById, OAFENTITY_KRYO_ENC)
+			.mapGroups(
 				(MapGroupsFunction<String, OafEntity, OafEntity>) (key, group) -> MergeUtils.mergeById(group, vocs),
 				OAFENTITY_KRYO_ENC)
 			.map(
 				(MapFunction<OafEntity, Tuple2<String, OafEntity>>) t -> new Tuple2<>(
 					t.getClass().getName(), t),
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java
@ -2,7 +2,6 @@
 package eu.dnetlib.dhp.schema.oaf.utils;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.OPENAIRE_META_RESOURCE_TYPE;
 import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
 import java.net.MalformedURLException;
@ -363,6 +362,8 @@ public class GraphCleaningFunctions extends CleaningFunctions {
 				// nothing to clean here
 			} else if (value instanceof Project) {
 				// nothing to clean here
 			} else if (value instanceof Person) {
 				// nothing to clean here
 			} else if (value instanceof Organization) {
 				Organization o = (Organization) value;
 				if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) {
@ -694,6 +695,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
 						}
 					}
 					// set ORCID_PENDING to all orcid values that are not coming from ORCID provenance
 					for (Author a : r.getAuthor()) {
 						if (Objects.isNull(a.getPid())) {
 							a.setPid(Lists.newArrayList());
@ -750,6 +752,40 @@ public class GraphCleaningFunctions extends CleaningFunctions {
 										.collect(Collectors.toList()));
 						}
 					}
 					// Identify clashing ORCIDs, i.e. the same ORCID associated with multiple authors in this result
 					Map<String, Integer> clashing_orcid = new HashMap<>();
 					for (Author a : r.getAuthor()) {
 						a
 							.getPid()
 							.stream()
 							.filter(
 								p -> StringUtils
 									.contains(StringUtils.lowerCase(p.getQualifier().getClassid()), ORCID_PENDING))
 							.map(StructuredProperty::getValue)
 							.distinct()
 							.forEach(orcid -> clashing_orcid.compute(orcid, (k, v) -> (v == null) ? 1 : v + 1));
 					}
 					Set<String> clashing = clashing_orcid
 						.entrySet()
 						.stream()
 						.filter(ee -> ee.getValue() > 1)
 						.map(Map.Entry::getKey)
 						.collect(Collectors.toSet());
 					// filter out clashing orcids
 					for (Author a : r.getAuthor()) {
 						a
 							.setPid(
 								a
 									.getPid()
 									.stream()
 									.filter(p -> !clashing.contains(p.getValue()))
 									.collect(Collectors.toList()));
 					}
 				}
 				if (value instanceof Publication) {
@ -808,7 +844,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
 		return author;
 	}
-	private static Optional<String> cleanDateField(Field<String> dateofacceptance) {
+	public static Optional<String> cleanDateField(Field<String> dateofacceptance) {
 		return Optional
 			.ofNullable(dateofacceptance)
 			.map(Field::getValue)
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java
@ -204,6 +204,7 @@ public class IdentifierFactory implements Serializable {
 			.map(
 				pp -> pp
 					.stream()
 					.filter(p -> StringUtils.isNotBlank(p.getValue()))
 					// filter away PIDs provided by a DS that is not considered an authority for the
 					// given PID Type
 					.filter(p -> shouldFilterPidByCriteria(collectedFrom, p, mapHandles))
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java
@ -23,24 +23,30 @@ import org.apache.commons.lang3.tuple.Pair;
 import com.github.sisyphsu.dateparser.DateParserUtils;
 import com.google.common.base.Joiner;
 import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
 import eu.dnetlib.dhp.oa.merge.AuthorMerger;
 import eu.dnetlib.dhp.schema.common.AccessRightComparator;
 import eu.dnetlib.dhp.schema.common.EntityType;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.*;
 public class MergeUtils {
-	public static <T extends Oaf> T mergeById(String s, Iterator<T> oafEntityIterator) {
+	public static <T extends Oaf> T mergeById(Iterator<T> oafEntityIterator, VocabularyGroup vocs) {
-		return mergeGroup(s, oafEntityIterator, true);
+		return mergeGroup(oafEntityIterator, true, vocs);
 	}
-	public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator) {
+	public static <T extends Oaf> T mergeGroup(Iterator<T> oafEntityIterator) {
-		return mergeGroup(s, oafEntityIterator, false);
+		return mergeGroup(oafEntityIterator, false);
 	}
-	public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator,
+	public static <T extends Oaf> T mergeGroup(Iterator<T> oafEntityIterator, boolean checkDelegateAuthority) {
-		boolean checkDelegateAuthority) {
+		return mergeGroup(oafEntityIterator, checkDelegateAuthority, null);
 	}
 	public static <T extends Oaf> T mergeGroup(Iterator<T> oafEntityIterator,
 		boolean checkDelegateAuthority, VocabularyGroup vocs) {
 		ArrayList<T> sortedEntities = new ArrayList<>();
 		oafEntityIterator.forEachRemaining(sortedEntities::add);
@ -49,13 +55,55 @@ public class MergeUtils {
 		Iterator<T> it = sortedEntities.iterator();
 		T merged = it.next();
-		while (it.hasNext()) {
+		if (!it.hasNext() && merged instanceof Result && vocs != null) {
-			merged = checkedMerge(merged, it.next(), checkDelegateAuthority);
+			return enforceResultType(vocs, (Result) merged);
 		} else {
 			while (it.hasNext()) {
 				merged = checkedMerge(merged, it.next(), checkDelegateAuthority);
 			}
 		}
 		return merged;
 	}
 	/**
 	 * Verifies that the declared result type of {@code mergedResult} matches the type expected for its first
 	 * instance, as resolved through the {@code ModelConstants.DNET_RESULT_TYPOLOGIES} vocabulary.
 	 * <p>
 	 * The input is returned unchanged when it has no instances (null or empty list), when the vocabulary does
 	 * not exist in {@code vocs}, or when the expected and declared result type class ids are equal. Otherwise a
 	 * new object is created from the class registered in {@code ModelSupport.oafTypes} for the expected type
 	 * (falling back to {@code OtherResearchProduct}), it receives the id of {@code mergedResult}, and the outcome
 	 * of {@code mergeResultFields(result, mergedResult)} is returned.
 	 *
 	 * @param vocs         vocabularies used to look up the expected result type; dereferenced without a null check
 	 * @param mergedResult the result to verify
 	 * @param <T>          type expected by the caller; the return value is cast to it without a runtime check
 	 * @return {@code mergedResult} itself, or the re-typed result described above
 	 * @throws IllegalStateException if the class registered for the expected type cannot be instantiated
 	 */
 	private static <T extends Oaf> T enforceResultType(VocabularyGroup vocs, Result mergedResult) {
 		// a null or empty instance list gives nothing to derive the expected type from
 		if (Optional.ofNullable(mergedResult.getInstance()).map(List::isEmpty).orElse(true)) {
 			return (T) mergedResult;
 		} else {
 			// only the first instance is considered
 			final Instance i = mergedResult.getInstance().get(0);
 			if (!vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) {
 				return (T) mergedResult;
 			} else {
 				// NOTE(review): i.getInstancetype() is dereferenced without a null check - confirm it is always set
 				// a lookup that yields null falls back to ModelConstants.ORP_DEFAULT_RESULTTYPE
 				final String expectedResultType = Optional
 					.ofNullable(
 						vocs
 							.lookupTermBySynonym(
 								ModelConstants.DNET_RESULT_TYPOLOGIES, i.getInstancetype().getClassid()))
 					.orElse(ModelConstants.ORP_DEFAULT_RESULTTYPE)
 					.getClassid();
 				// a mismatch between the expected and the declared type means there is a clash among the result types
 				// NOTE(review): mergedResult.getResulttype() is dereferenced without a null check - confirm
 				if (!expectedResultType.equals(mergedResult.getResulttype().getClassid())) {
 					// NOTE(review): orElse(...) evaluates its argument eagerly, so an OtherResearchProduct is
 					// created even when a class is found; Class.newInstance() is deprecated since Java 9
 					Result result = (Result) Optional
 						.ofNullable(ModelSupport.oafTypes.get(expectedResultType))
 						.map(r -> {
 							try {
 								return r.newInstance();
 							} catch (InstantiationException | IllegalAccessException e) {
 								throw new IllegalStateException(e);
 							}
 						})
 						.orElse(new OtherResearchProduct());
 					result.setId(mergedResult.getId());
 					return (T) mergeResultFields(result, mergedResult);
 				} else {
 					return (T) mergedResult;
 				}
 			}
 		}
 	}
 	/**
 	 * Merges {@code right} into {@code left} by delegating to {@code merge(left, right, checkDelegateAuthority)}
 	 * and casting the outcome to the caller's type {@code T}.
 	 *
 	 * @param left                   first object to merge
 	 * @param right                  second object to merge
 	 * @param checkDelegateAuthority flag forwarded unchanged to {@code merge}
 	 * @param <T>                    common type of the two inputs
 	 * @return the object returned by {@code merge}, cast to {@code T} (the cast is unchecked)
 	 */
 	public static <T extends Oaf> T checkedMerge(final T left, final T right, boolean checkDelegateAuthority) {
 		return (T) merge(left, right, checkDelegateAuthority);
 	}
@ -106,7 +154,7 @@ public class MergeUtils {
 				return mergeSoftware((Software) left, (Software) right);
 			}
-			return mergeResultFields((Result) left, (Result) right);
+			return left;
 		} else if (sameClass(left, right, Datasource.class)) {
 			// TODO
 			final int trust = compareTrust(left, right);
@ -654,16 +702,9 @@ public class MergeUtils {
 	}
 	private static Field<String> selectOldestDate(Field<String> d1, Field<String> d2) {
-		if (d1 == null || StringUtils.isBlank(d1.getValue())) {
+		if (!GraphCleaningFunctions.cleanDateField(d1).isPresent()) {
 			return d2;
-		} else if (d2 == null || StringUtils.isBlank(d2.getValue())) {
+		} else if (!GraphCleaningFunctions.cleanDateField(d2).isPresent()) {
 			return d1;
 		}
 		if (StringUtils.contains(d1.getValue(), "null")) {
 			return d2;
 		}
 		if (StringUtils.contains(d2.getValue(), "null")) {
 			return d1;
 		}
@ -715,7 +756,11 @@ public class MergeUtils {
 	/**
 	 * Builds the key of a structured property by joining the key of its qualifier (as produced by
 	 * {@code qualifierKeyExtractor}) and its value with the separator {@code "||"}.
 	 * A null component is rendered as an empty string thanks to {@code useForNull("")}.
 	 *
 	 * @param sp the structured property, may be null
 	 * @return the joined key, or {@code null} when {@code sp} is null
 	 */
 	private static String spKeyExtractor(StructuredProperty sp) {
 		return Optional
 			.ofNullable(sp)
-			.map(s -> Joiner.on("||").join(qualifierKeyExtractor(s.getQualifier()), s.getValue()))
+			.map(
 				s -> Joiner
 					.on("||")
 					.useForNull("")
 					.join(qualifierKeyExtractor(s.getQualifier()), s.getValue()))
 			.orElse(null);
 	}
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ModelHardLimits.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ModelHardLimits.java
@ -1,6 +1,12 @@
 package eu.dnetlib.dhp.schema.oaf.utils;
 import java.util.Map;
 import com.google.common.collect.Maps;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 public class ModelHardLimits {
 	private ModelHardLimits() {
@ -12,6 +18,7 @@ public class ModelHardLimits {
 	public static final int MAX_EXTERNAL_ENTITIES = 50;
 	public static final int MAX_AUTHORS = 200;
 	public static final int MAX_RELATED_AUTHORS = 20;
 	public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000;
 	public static final int MAX_TITLE_LENGTH = 5000;
 	public static final int MAX_TITLES = 10;
@ -19,6 +26,12 @@ public class ModelHardLimits {
 	public static final int MAX_ABSTRACT_LENGTH = 150000;
 	public static final int MAX_RELATED_ABSTRACT_LENGTH = 500;
 	public static final int MAX_INSTANCES = 10;
 	/**
 	 * Limits keyed by relation class constant; filled in by the static initializer below.
 	 * Presumably the maximum number of relations kept per relation class - confirm against the consumers.
 	 * NOTE(review): the map is a publicly exposed mutable HashMap.
 	 */
 	public static final Map<String, Long> MAX_RELATIONS_BY_RELCLASS = Maps.newHashMap();
 	// registers a limit of 500 for both the co-authorship and the authorship relation classes
 	static {
 		MAX_RELATIONS_BY_RELCLASS.put(ModelConstants.PERSON_PERSON_HASCOAUTHORED, 500L);
 		MAX_RELATIONS_BY_RELCLASS.put(ModelConstants.RESULT_PERSON_HASAUTHORED, 500L);
 	}
 	public static String getCollectionName(String format) {
 		return format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION;
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidCleaner.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidCleaner.java
@ -26,7 +26,7 @@ public class PidCleaner {
 		String value = Optional
 			.ofNullable(pidValue)
 			.map(String::trim)
-			.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
+			.orElseThrow(() -> new IllegalArgumentException("PID (" + pidType + ") value cannot be empty"));
 		switch (pidType) {
--- a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java
+++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java
@ -179,7 +179,7 @@ class OafMapperUtilsTest {
 		assertEquals(
 			ModelConstants.DATASET_RESULTTYPE_CLASSID,
 			((Result) MergeUtils
-				.merge(p2, d1))
+				.merge(p2, d1, true))
 					.getResulttype()
 					.getClassid());
 	}
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NumAuthorsTitleSuffixPrefixChain.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NumAuthorsTitleSuffixPrefixChain.java
@ -38,7 +38,7 @@ public class NumAuthorsTitleSuffixPrefixChain extends AbstractClusteringFunction
 	/**
 	 * Applies the clustering function to a single value: the value is passed through {@code cleanup} and then
 	 * handed to {@code suffixPrefixChain} together with the "mod" parameter, which defaults to 10 when it is
 	 * not configured.
 	 *
 	 * @param conf the configuration (not read by this method)
 	 * @param s    the value to process
 	 * @return the collection returned by {@code suffixPrefixChain}
 	 */
 	@Override
 	protected Collection<String> doApply(Config conf, String s) {
-		return suffixPrefixChain(cleanup(s), param("mod"));
+		return suffixPrefixChain(cleanup(s), paramOrDefault("mod", 10));
 	}
 	private Collection<String> suffixPrefixChain(String s, int mod) {
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
@ -90,7 +90,7 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
 		inferFrom = normalize(inferFrom);
 		inferFrom = filterAllStopWords(inferFrom);
 		Set<String> cities = getCities(inferFrom, 4);
-		return citiesToCountry(cities).stream().findFirst().orElse("UNKNOWN");
+		return citiesToCountry(cities).stream().filter(Objects::nonNull).findFirst().orElse("UNKNOWN");
 	}
 	public static String cityInference(String original) {
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java
@ -54,6 +54,22 @@ public class FieldDef implements Serializable {
 	public FieldDef() {
 	}
 	/**
 	 * Creates a new {@code FieldDef} through the no-argument constructor and copies into it, via the setters,
 	 * the name, path, type, overrideMatch, size, length, filter, sorted, clean, infer and inferenceFrom values
 	 * of this definition.
 	 * <p>
 	 * The values are handed to the setters as they are; this method does not call {@code super.clone()}.
 	 *
 	 * @return a new definition carrying the same values as this one
 	 */
 	public FieldDef clone() {
 		FieldDef fieldDef = new FieldDef();
 		fieldDef.setName(this.name);
 		fieldDef.setPath(this.path);
 		fieldDef.setType(this.type);
 		fieldDef.setOverrideMatch(this.overrideMatch);
 		fieldDef.setSize(this.size);
 		fieldDef.setLength(this.length);
 		fieldDef.setFilter(this.filter);
 		fieldDef.setSorted(this.sorted);
 		fieldDef.setClean(this.clean);
 		fieldDef.setInfer(this.infer);
 		fieldDef.setInferenceFrom(this.inferenceFrom);
 		return fieldDef;
 	}
 	public String getInferenceFrom() {
 		return inferenceFrom;
 	}
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkDeduper.scala
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkDeduper.scala
@ -19,48 +19,10 @@ case class SparkDeduper(conf: DedupConfig) extends Serializable {
  val model: SparkModel = SparkModel(conf)
  val dedup: (Dataset[Row] => Dataset[Row]) = df => {
-    df.transform(filterAndCleanup)
+    df.transform(generateClustersWithCollect)
      .transform(generateClustersWithCollect)
      .transform(processBlocks)
  }
  // Adds one "<field>_filtered" column for every model field that has a blacklist configured.
  // Each new column is computed by applying filterColumnUDF(fdef) to the original column;
  // fields without a blacklist leave the dataset untouched.
  val filterAndCleanup: (Dataset[Row] => Dataset[Row]) = df => {
    // fold over the model fields, threading the (possibly extended) dataset through each step
    val df_with_filters = conf.getPace.getModel.asScala.foldLeft(df)((res, fdef) => {
      if (conf.blacklists.containsKey(fdef.getName)) {
        res.withColumn(
          fdef.getName + "_filtered",
          filterColumnUDF(fdef).apply(new Column(fdef.getName))
        )
      } else {
        res
      }
    })
    df_with_filters
  }
  // Builds the UDF that applies the blacklist configured for the given field.
  // For List/JSON fields the UDF drops every array element matched by the blacklist;
  // for any other type it replaces a matching value with the empty string.
  // Throws IllegalArgumentException when no blacklist is configured for the field.
  def filterColumnUDF(fdef: FieldDef): UserDefinedFunction = {
    val blacklist: Predicate[String] = conf.blacklists().get(fdef.getName)
    if (blacklist == null) {
      throw new IllegalArgumentException("Column: " + fdef.getName + " does not have any filter")
    } else {
      fdef.getType match {
        case Type.List | Type.JSON =>
          // keep only the elements that the blacklist does not match
          udf[Array[String], Array[String]](values => {
            values.filter((v: String) => !blacklist.test(v))
          })
        case _ =>
          // a blacklisted scalar value is blanked out rather than removed
          udf[String, String](v => {
            if (blacklist.test(v)) ""
            else v
          })
      }
    }
  }
  val generateClustersWithCollect: (Dataset[Row] => Dataset[Row]) = df_with_filters => {
    var df_with_clustering_keys: Dataset[Row] = null
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala
@ -5,12 +5,12 @@ import eu.dnetlib.pace.common.AbstractPaceFunctions
 import eu.dnetlib.pace.config.{DedupConfig, Type}
 import eu.dnetlib.pace.util.{MapDocumentUtil, SparkCompatUtils}
 import org.apache.commons.lang3.StringUtils
 import org.apache.spark.sql.catalyst.encoders.RowEncoder
 import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
 import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType}
 import org.apache.spark.sql.{Dataset, Row}
 import java.util.Locale
 import java.util.function.Predicate
 import java.util.regex.Pattern
 import scala.collection.JavaConverters._
@ -29,8 +29,20 @@ case class SparkModel(conf: DedupConfig) {
    identifier.setName(identifierFieldName)
    identifier.setType(Type.String)
    // create fields for blacklist
    val filtered = conf.getPace.getModel.asScala.flatMap(fdef => {
      if (conf.blacklists().containsKey(fdef.getName)) {
        val fdef_filtered = fdef.clone()
        fdef_filtered.setName(fdef.getName + "_filtered")
        Seq(fdef, fdef_filtered)
      }
      else {
        Seq(fdef)
      }
    })
    // Construct a Spark StructType representing the schema of the model
-    (Seq(identifier) ++ conf.getPace.getModel.asScala)
+    (Seq(identifier) ++ filtered)
      .foldLeft(
        new StructType()
      )((resType, fieldDef) => {
@ -44,7 +56,6 @@ case class SparkModel(conf: DedupConfig) {
        })
      })
  }
  val identityFieldPosition: Int = schema.fieldIndex(identifierFieldName)
@ -52,7 +63,8 @@ case class SparkModel(conf: DedupConfig) {
  val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName)
  val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => {
-    df.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema))
+    df
      .map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema))
  }
  def rowFromJson(json: String): Row = {
@ -64,41 +76,63 @@ case class SparkModel(conf: DedupConfig) {
    schema.fieldNames.zipWithIndex.foldLeft(values) {
      case ((res, (fname, index))) =>
-        val fdef = conf.getPace.getModelMap.get(fname)
+
        val fdef = conf.getPace.getModelMap.get(fname.split("_filtered")(0))
        if (fdef != null) {
-          res(index) = fdef.getType match {
+          if (!fname.contains("_filtered")) { //process fields with no blacklist
-            case Type.String | Type.Int =>
+            res(index) = fdef.getType match {
-              MapDocumentUtil.truncateValue(
+              case Type.String | Type.Int =>
-                MapDocumentUtil.getJPathString(fdef.getPath, documentContext),
+                MapDocumentUtil.truncateValue(
-                fdef.getLength
+                  MapDocumentUtil.getJPathString(fdef.getPath, documentContext),
-              )
+                  fdef.getLength
                )
-            case Type.URL =>
+              case Type.URL =>
-              var uv = MapDocumentUtil.getJPathString(fdef.getPath, documentContext)
+                var uv = MapDocumentUtil.getJPathString(fdef.getPath, documentContext)
-              if (!URL_REGEX.matcher(uv).matches)
+                if (!URL_REGEX.matcher(uv).matches)
-                uv = ""
+                  uv = ""
-              uv
+                uv
-            case Type.List | Type.JSON =>
+              case Type.List | Type.JSON =>
-              MapDocumentUtil.truncateList(
+                MapDocumentUtil.truncateList(
-                MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType),
+                  MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType),
-                fdef.getSize
+                  fdef.getSize
-              ).asScala
+                ).asScala
-            case Type.StringConcat =>
+              case Type.StringConcat =>
-              val jpaths = CONCAT_REGEX.split(fdef.getPath)
+                val jpaths = CONCAT_REGEX.split(fdef.getPath)
-              MapDocumentUtil.truncateValue(
+                MapDocumentUtil.truncateValue(
-                jpaths
+                  jpaths
-                  .map(jpath => MapDocumentUtil.getJPathString(jpath, documentContext))
+                    .map(jpath => MapDocumentUtil.getJPathString(jpath, documentContext))
-                  .mkString(" "),
+                    .mkString(" "),
-                fdef.getLength
+                  fdef.getLength
-              )
+                )
-            case Type.DoubleArray =>
+              case Type.DoubleArray =>
-              MapDocumentUtil.getJPathArray(fdef.getPath, json)
+                MapDocumentUtil.getJPathArray(fdef.getPath, json)
            }
          }
          else { //process fields with blacklist
            val blacklist: Predicate[String] = conf.blacklists().get(fdef.getName)
            res(index) = fdef.getType match {
              case Type.List | Type.JSON =>
                MapDocumentUtil.truncateList(
                  MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType),
                  fdef.getSize
                ).asScala.filter((v: String) => !blacklist.test(v))
              case _ =>
                val value: String = MapDocumentUtil.truncateValue(
                  MapDocumentUtil.getJPathString(fdef.getPath, documentContext),
                  fdef.getLength
                )
                if (blacklist.test(value)) "" else value
            }
          }
          val filter = fdef.getFilter
@ -125,13 +159,12 @@ case class SparkModel(conf: DedupConfig) {
          }
          if (StringUtils.isNotBlank(fdef.getInfer)) {
-            val inferFrom : String = if (StringUtils.isNotBlank(fdef.getInferenceFrom)) fdef.getInferenceFrom else fdef.getPath
+            val inferFrom: String = if (StringUtils.isNotBlank(fdef.getInferenceFrom)) fdef.getInferenceFrom else fdef.getPath
            res(index) = res(index) match {
              case x: Seq[String] => x.map(inference(_, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer))
              case _ => inference(res(index).toString, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer)
            }
          }
        }
        res
@ -139,6 +172,7 @@ case class SparkModel(conf: DedupConfig) {
    }
    new GenericRowWithSchema(values, schema)
  }
  def clean(value: String, cleantype: String) : String = {
--- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
+++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
@ -227,4 +227,17 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
 		System.out.println(cf.apply(conf, Lists.newArrayList(s)));
 	}
 	/**
 	 * Smoke test for {@code NumAuthorsTitleSuffixPrefixChain}: applies the function to a number of authors and
 	 * a title and prints the inputs and the outcome. It contains no assertions, so it only fails when an
 	 * exception is thrown.
 	 */
 	@Test
 	public void testNumAuthorsTitleSuffixPrefixChain() {
 		final ClusteringFunction cf = new NumAuthorsTitleSuffixPrefixChain(params);
 		// NOTE(review): "mod" is added after the function was constructed; whether the function sees it
 		// depends on how the constructor stores params - confirm
 		params.put("mod", 10);
 		final String title = "PARP-2 Regulates SIRT1 Expression and Whole-Body Energy Expenditure";
 		final String num_authors = "10";
 		System.out.println("title = " + title);
 		System.out.println("num_authors = " + num_authors);
 		System.out.println(cf.apply(conf, Lists.newArrayList(num_authors, title)));
 	}
 }
--- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/common/PaceFunctionTest.java
+++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/common/PaceFunctionTest.java
@ -1,8 +1,7 @@
 package eu.dnetlib.pace.common;
-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.*;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 import org.junit.jupiter.api.*;
@ -54,8 +53,17 @@ public class PaceFunctionTest extends AbstractPaceFunctions {
 		System.out.println("Fixed aliases  : " + fixAliases(TEST_STRING));
 	}
 	/**
 	 * Checks that {@code countryInference("UNKNOWN", null)} throws a {@code NullPointerException}.
 	 */
 	@Test()
 	public void countryInferenceTest_NPE() {
 		assertThrows(
 			NullPointerException.class,
 			() -> countryInference("UNKNOWN", null),
 			"Expected countryInference() to throw an NPE");
 	}
 	@Test
 	public void countryInferenceTest() {
 		assertEquals("UNKNOWN", countryInference("UNKNOWN", ""));
 		assertEquals("IT", countryInference("UNKNOWN", "Università di Bologna"));
 		assertEquals("UK", countryInference("UK", "Università di Bologna"));
 		assertEquals("IT", countryInference("UNKNOWN", "Universiteé de Naples"));
--- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java
+++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java
@ -367,7 +367,18 @@ public class ComparatorTest extends AbstractPaceTest {
 		result = dateRange.distance("invalid date", "2021-05-02", conf);
 		assertEquals(-1.0, result);
 	}
 	/**
 	 * Checks that {@code TitleVersionMatch} returns 1.0 when a title is compared with an identical copy of itself.
 	 */
 	@Test
 	public void titleVersionMatchTest() {
 		TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
 		// both arguments are the same title string
 		double result = titleVersionMatch
 			.compare(
 				"parp 2 regulates sirt 1 expression and whole body energy expenditure",
 				"parp 2 regulates sirt 1 expression and whole body energy expenditure", conf);
 		assertEquals(1.0, result);
 	}
 }
--- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java
+++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java
@ -11,7 +11,6 @@ import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 import eu.dnetlib.pace.model.Person;
 import jdk.nashorn.internal.ir.annotations.Ignore;
 public class UtilTest {
--- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java
+++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java
@ -151,12 +151,17 @@ public class PromoteActionPayloadForGraphTableJob {
 		SparkSession spark, String path, Class<G> rowClazz) {
 		logger.info("Reading graph table from path: {}", path);
-		return spark
+		if (HdfsSupport.exists(path, spark.sparkContext().hadoopConfiguration())) {
-			.read()
+			return spark
-			.textFile(path)
+				.read()
-			.map(
+				.textFile(path)
-				(MapFunction<String, G>) value -> OBJECT_MAPPER.readValue(value, rowClazz),
+				.map(
-				Encoders.bean(rowClazz));
+					(MapFunction<String, G>) value -> OBJECT_MAPPER.readValue(value, rowClazz),
 					Encoders.bean(rowClazz));
 		} else {
 			logger.info("Found empty graph table from path: {}", path);
 			return spark.emptyDataset(Encoders.bean(rowClazz));
 		}
 	}
 	private static <A extends Oaf> Dataset<A> readActionPayload(
@ -223,7 +228,7 @@ public class PromoteActionPayloadForGraphTableJob {
 				rowClazz,
 				actionPayloadClazz);
-		if (shouldGroupById) {
+		if (Boolean.TRUE.equals(shouldGroupById)) {
 			return PromoteActionPayloadFunctions
 				.groupGraphTableByIdAndMerge(
 					joinedAndMerged, rowIdFn, mergeRowsAndGetFn, zeroFn, isNotZeroFn, rowClazz);
@ -250,6 +255,8 @@ public class PromoteActionPayloadForGraphTableJob {
 				return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Relation());
 			case "eu.dnetlib.dhp.schema.oaf.Software":
 				return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Software());
 			case "eu.dnetlib.dhp.schema.oaf.Person":
 				return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Person());
 			default:
 				throw new RuntimeException("unknown class: " + clazz.getCanonicalName());
 		}
--- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java
+++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java
@ -50,7 +50,7 @@ public class PromoteActionPayloadFunctions {
 		PromoteAction.Strategy promoteActionStrategy,
 		Class<G> rowClazz,
 		Class<A> actionPayloadClazz) {
-		if (!isSubClass(rowClazz, actionPayloadClazz)) {
+		if (Boolean.FALSE.equals(isSubClass(rowClazz, actionPayloadClazz))) {
 			throw new RuntimeException(
 				"action payload type must be the same or be a super type of table row type");
 		}
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/import.txt
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/import.txt
@ -7,3 +7,4 @@ promote_action_payload_for_project_table classpath eu/dnetlib/dhp/actionmanager/
 promote_action_payload_for_publication_table classpath eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app
 promote_action_payload_for_relation_table classpath eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app
 promote_action_payload_for_software_table classpath eu/dnetlib/dhp/actionmanager/wf/software/oozie_app
 promote_action_payload_for_person_table classpath eu/dnetlib/dhp/actionmanager/wf/person/oozie_app
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml
@ -148,6 +148,7 @@
        <path start="PromoteActionPayloadForPublicationTable"/>
        <path start="PromoteActionPayloadForRelationTable"/>
        <path start="PromoteActionPayloadForSoftwareTable"/>
        <path start="PromoteActionPayloadForPersonTable"/>
    </fork>
    <action name="PromoteActionPayloadForDatasetTable">
@ -270,6 +271,21 @@
        <error to="Kill"/>
    </action>
    <!-- Runs the person-table promotion as a sub-workflow found under the application path.
         The parent configuration is propagated and inputActionPayloadRootPath is set to
         ${workingDir}/action_payload_by_type. On success control moves to JoinPromote, on error to Kill. -->
    <action name="PromoteActionPayloadForPersonTable">
        <sub-workflow>
            <app-path>${wf:appPath()}/promote_action_payload_for_person_table</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>inputActionPayloadRootPath</name>
                    <value>${workingDir}/action_payload_by_type</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="JoinPromote"/>
        <error to="Kill"/>
    </action>
    <join name="JoinPromote" to="End"/>
    <end name="End"/>
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/person/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/person/oozie_app/workflow.xml
@ -0,0 +1,129 @@
 <workflow-app name="promote_action_payload_for_person_table" xmlns="uri:oozie:workflow:0.5">
    <!-- Formal parameters of the person promotion workflow. Every variable consumed by the actions below
         is declared here so that a missing value is reported when the job is submitted. -->
    <parameters>
        <property>
            <name>activePromotePersonActionPayload</name>
            <description>when true will promote actions with eu.dnetlib.dhp.schema.oaf.Person payload</description>
        </property>
        <property>
            <name>inputGraphRootPath</name>
            <description>root location of input materialized graph</description>
        </property>
        <property>
            <name>inputActionPayloadRootPath</name>
            <description>root location of action payloads to promote</description>
        </property>
        <property>
            <name>outputGraphRootPath</name>
            <description>root location for output materialized graph</description>
        </property>
        <property>
            <name>mergeAndGetStrategy</name>
            <description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description>
        </property>
        <property>
            <name>promoteActionStrategy</name>
            <description>strategy for promoting the action payload into the graph table, forwarded to the promote job as --promoteActionStrategy</description>
        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
        </property>
        <property>
            <name>sparkExecutorMemory</name>
            <description>memory for individual executor</description>
        </property>
        <property>
            <name>sparkExecutorCores</name>
            <description>number of cores used by single executor</description>
        </property>
        <property>
            <name>oozieActionShareLibForSpark2</name>
            <description>oozie action sharelib for spark 2.*</description>
        </property>
        <property>
            <name>spark2ExtraListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
            <description>spark 2.* extra listeners classname</description>
        </property>
        <property>
            <name>spark2SqlQueryExecutionListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
            <description>spark 2.* sql query execution listeners classname</description>
        </property>
        <property>
            <name>spark2YarnHistoryServerAddress</name>
            <description>spark 2.* yarn history server address</description>
        </property>
        <property>
            <name>spark2EventLogDir</name>
            <description>spark 2.* event log dir location</description>
        </property>
    </parameters>
    <!-- Settings shared by all the actions of this workflow: job tracker, name node and the
         oozie.action.sharelib.for.spark property, valued with ${oozieActionShareLibForSpark2}. -->
    <global>
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <configuration>
            <property>
                <name>oozie.action.sharelib.for.spark</name>
                <value>${oozieActionShareLibForSpark2}</value>
            </property>
        </configuration>
    </global>
    <start to="DecisionPromotePersonActionPayload"/>
    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <!-- Chooses between promoting and skipping. The promotion runs only when
         activePromotePersonActionPayload is "true" AND the path
         <nameNode>/<inputActionPayloadRootPath>/clazz=eu.dnetlib.dhp.schema.oaf.Person exists;
         in any other case the flow continues with SkipPromotePersonActionPayloadForPersonTable. -->
    <decision name="DecisionPromotePersonActionPayload">
        <switch>
            <case to="PromotePersonActionPayloadForPersonTable">
                ${(activePromotePersonActionPayload eq "true") and
                (fs:exists(concat(concat(concat(concat(wf:conf('nameNode'),'/'),wf:conf('inputActionPayloadRootPath')),'/'),'clazz=eu.dnetlib.dhp.schema.oaf.Person')) eq "true")}
            </case>
            <default to="SkipPromotePersonActionPayloadForPersonTable"/>
        </switch>
    </decision>
    <!-- Spark action running PromoteActionPayloadForGraphTableJob for the Person class: the graph table is
         read from ${inputGraphRootPath}/person, the action payloads from
         ${inputActionPayloadRootPath}/clazz=eu.dnetlib.dhp.schema.oaf.Person and the result is written to
         ${outputGraphRootPath}/person. Graph table class and action payload class are both
         eu.dnetlib.dhp.schema.oaf.Person.
         NOTE(review): spark.executor.memoryOverhead is set to the same value as the executor memory;
         confirm this sizing is intended. -->
    <action name="PromotePersonActionPayloadForPersonTable">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>PromotePersonActionPayloadForPersonTable</name>
            <class>eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJob</class>
            <jar>dhp-actionmanager-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            </spark-opts>
            <arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/person</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
            <arg>--inputActionPayloadPath</arg><arg>${inputActionPayloadRootPath}/clazz=eu.dnetlib.dhp.schema.oaf.Person</arg>
            <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
            <arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/person</arg>
            <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
            <arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
    <!-- Fallback used when nothing has to be promoted: after deleting ${outputGraphRootPath}/person,
         the person table is copied as it is from the input graph to the output graph via distcp (-pb). -->
    <action name="SkipPromotePersonActionPayloadForPersonTable">
        <distcp xmlns="uri:oozie:distcp-action:0.2">
            <prepare>
                <delete path="${outputGraphRootPath}/person"/>
            </prepare>
            <arg>-pb</arg>
            <arg>${inputGraphRootPath}/person</arg>
            <arg>${outputGraphRootPath}/person</arg>
        </distcp>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
    <end name="End"/>
 </workflow-app>
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java
@ -34,7 +34,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
 import scala.Tuple2;
 /**
- * Creates action sets for Crossref affiliation relations inferred by BIP!
+ * Creates action sets for Crossref affiliation relations inferred by OpenAIRE
 */
 public class PrepareAffiliationRelations implements Serializable {
@ -104,22 +104,22 @@ public class PrepareAffiliationRelations implements Serializable {
 			.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
 		JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelationsNewModel(
-			spark, crossrefInputPath, collectedfromOpenAIRE);
+			spark, crossrefInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":crossref");
 		JavaPairRDD<Text, Text> pubmedRelations = prepareAffiliationRelations(
-			spark, pubmedInputPath, collectedfromOpenAIRE);
+			spark, pubmedInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":pubmed");
 		JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelationsNewModel(
-			spark, openapcInputPath, collectedfromOpenAIRE);
+			spark, openapcInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":openapc");
-		JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
+		JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelationsNewModel(
-			spark, dataciteInputPath, collectedfromOpenAIRE);
+			spark, dataciteInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":datacite");
-		JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelations(
+		JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelationsNewModel(
-			spark, webcrawlInputPath, collectedfromOpenAIRE);
+			spark, webcrawlInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":rawaff");
-		JavaPairRDD<Text, Text> publisherRelations = prepareAffiliationRelationFromPublisher(
+		JavaPairRDD<Text, Text> publisherRelations = prepareAffiliationRelationFromPublisherNewModel(
-			spark, publisherlInputPath, collectedfromOpenAIRE);
+			spark, publisherlInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":webcrawl");
 		crossrefRelations
 			.union(pubmedRelations)
@ -133,7 +133,8 @@ public class PrepareAffiliationRelations implements Serializable {
 	private static JavaPairRDD<Text, Text> prepareAffiliationRelationFromPublisherNewModel(SparkSession spark,
 		String inputPath,
-		List<KeyValue> collectedfrom) {
+		List<KeyValue> collectedfrom,
 		String dataprovenance) {
 		Dataset<Row> df = spark
 			.read()
@ -142,12 +143,13 @@ public class PrepareAffiliationRelations implements Serializable {
 			.json(inputPath)
 			.where("DOI is not null");
-		return getTextTextJavaPairRDD(collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"));
+		return getTextTextJavaPairRDDNew(
 			collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"), dataprovenance);
 	}
 	private static JavaPairRDD<Text, Text> prepareAffiliationRelationFromPublisher(SparkSession spark, String inputPath,
-		List<KeyValue> collectedfrom) {
+		List<KeyValue> collectedfrom, String dataprovenance) {
 		Dataset<Row> df = spark
 			.read()
@ -155,13 +157,14 @@ public class PrepareAffiliationRelations implements Serializable {
 			.json(inputPath)
 			.where("DOI is not null");
-		return getTextTextJavaPairRDD(collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"));
+		return getTextTextJavaPairRDD(
 			collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"), dataprovenance);
 	}
 	private static <I extends Result> JavaPairRDD<Text, Text> prepareAffiliationRelations(SparkSession spark,
 		String inputPath,
-		List<KeyValue> collectedfrom) {
+		List<KeyValue> collectedfrom, String dataprovenance) {
 		// load and parse affiliation relations from HDFS
 		Dataset<Row> df = spark
@ -170,12 +173,12 @@ public class PrepareAffiliationRelations implements Serializable {
 			.json(inputPath)
 			.where("DOI is not null");
-		return getTextTextJavaPairRDD(collectedfrom, df);
+		return getTextTextJavaPairRDD(collectedfrom, df, dataprovenance);
 	}
 	private static <I extends Result> JavaPairRDD<Text, Text> prepareAffiliationRelationsNewModel(SparkSession spark,
 		String inputPath,
-		List<KeyValue> collectedfrom) {
+		List<KeyValue> collectedfrom, String dataprovenance) {
 		// load and parse affiliation relations from HDFS
 		Dataset<Row> df = spark
 			.read()
@ -184,10 +187,11 @@ public class PrepareAffiliationRelations implements Serializable {
 			.json(inputPath)
 			.where("DOI is not null");
-		return getTextTextJavaPairRDDNew(collectedfrom, df);
+		return getTextTextJavaPairRDDNew(collectedfrom, df, dataprovenance);
 	}
-	private static JavaPairRDD<Text, Text> getTextTextJavaPairRDD(List<KeyValue> collectedfrom, Dataset<Row> df) {
+	private static JavaPairRDD<Text, Text> getTextTextJavaPairRDD(List<KeyValue> collectedfrom, Dataset<Row> df,
 		String dataprovenance) {
 		// unroll nested arrays
 		df = df
 			.withColumn("matching", functions.explode(new Column("Matchings")))
@ -219,7 +223,7 @@ public class PrepareAffiliationRelations implements Serializable {
 				DataInfo dataInfo = OafMapperUtils
 					.dataInfo(
 						false,
-						BIP_INFERENCE_PROVENANCE,
+						dataprovenance,
 						true,
 						false,
 						qualifier,
@ -235,7 +239,8 @@ public class PrepareAffiliationRelations implements Serializable {
 					new Text(OBJECT_MAPPER.writeValueAsString(aa))));
 	}
-	private static JavaPairRDD<Text, Text> getTextTextJavaPairRDDNew(List<KeyValue> collectedfrom, Dataset<Row> df) {
+	private static JavaPairRDD<Text, Text> getTextTextJavaPairRDDNew(List<KeyValue> collectedfrom, Dataset<Row> df,
 		String dataprovenance) {
 		// unroll nested arrays
 		df = df
 			.withColumn("matching", functions.explode(new Column("Matchings")))
@ -276,7 +281,7 @@ public class PrepareAffiliationRelations implements Serializable {
 				DataInfo dataInfo = OafMapperUtils
 					.dataInfo(
 						false,
-						BIP_INFERENCE_PROVENANCE,
+						dataprovenance,
 						true,
 						false,
 						qualifier,
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java
@ -2,21 +2,31 @@
 package eu.dnetlib.dhp.actionmanager.personentity;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import static org.apache.spark.sql.functions.*;
 import java.io.BufferedWriter;
 import java.io.IOException;
 import java.io.OutputStreamWriter;
 import java.io.Serializable;
 import java.nio.charset.StandardCharsets;
 import java.sql.ResultSet;
 import java.sql.SQLException;
 import java.util.*;
 import java.util.stream.Collectors;
 import org.apache.commons.cli.ParseException;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.compress.BZip2Codec;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.*;
 import org.apache.spark.sql.*;
 import org.apache.spark.sql.Dataset;
 import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -28,13 +38,14 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.collection.orcid.model.Author;
 import eu.dnetlib.dhp.collection.orcid.model.Employment;
 import eu.dnetlib.dhp.collection.orcid.model.Work;
 import eu.dnetlib.dhp.common.DbClient;
 import eu.dnetlib.dhp.common.HdfsSupport;
 import eu.dnetlib.dhp.common.person.CoAuthorshipIterator;
 import eu.dnetlib.dhp.common.person.Coauthors;
 import eu.dnetlib.dhp.schema.action.AtomicAction;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
-import eu.dnetlib.dhp.schema.oaf.KeyValue;
+import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.schema.oaf.Person;
 import eu.dnetlib.dhp.schema.oaf.Relation;
 import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
 import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
 import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
@ -44,7 +55,7 @@ import scala.Tuple2;
 public class ExtractPerson implements Serializable {
 	private static final Logger log = LoggerFactory.getLogger(ExtractPerson.class);
-
+	private static final String QUERY = "SELECT * FROM project_person WHERE pid_type = 'ORCID'";
 	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 	private static final String OPENAIRE_PREFIX = "openaire____";
 	private static final String SEPARATOR = "::";
@ -58,9 +69,48 @@ public class ExtractPerson implements Serializable {
 	private static final String PMCID_PREFIX = "50|pmcid_______::";
 	private static final String ROR_PREFIX = "20|ror_________::";
-	private static final String PERSON_PREFIX = ModelSupport.getIdPrefix(Person.class) + "|orcid_______";
+	private static final String PERSON_PREFIX = ModelSupport.getIdPrefix(Person.class)
 		+ IdentifierFactory.ID_PREFIX_SEPARATOR + ModelConstants.ORCID + "_______";
 	private static final String PROJECT_ID_PREFIX = ModelSupport.getIdPrefix(Project.class)
 		+ IdentifierFactory.ID_PREFIX_SEPARATOR;
 	// class id / class name of the provenance-action qualifier used for the information imported from ORCID
 	public static final String ORCID_AUTHORS_CLASSID = "sysimport:crosswalk:orcid";
 	public static final String ORCID_AUTHORS_CLASSNAME = "Imported from ORCID";
 	// class id / class name of the provenance-action qualifier used for the information imported from the funder database
 	public static final String FUNDER_AUTHORS_CLASSID = "sysimport:crosswalk:funderdatabase";
 	public static final String FUNDER_AUTHORS_CLASSNAME = "Imported from Funder Database";
 	// identifier and name of the OpenAIRE datasource
 	public static final String OPENAIRE_DATASOURCE_ID = "10|infrastruct_::f66f1bd369679b5b077dcdf006089556";
 	public static final String OPENAIRE_DATASOURCE_NAME = "OpenAIRE";
 	// collectedfrom value pointing to the OpenAIRE datasource, set on the person-project relations.
 	// NOTE(review): public, non-final and mutable; presumably meant to be a constant - confirm nobody reassigns it
 	public static List<KeyValue> collectedfromOpenAIRE = OafMapperUtils
 		.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
 	// DataInfo shared by the Person entities and by the relations built from the ORCID dump;
 	// the last argument ("0.91") is presumably the trust - TODO confirm against OafMapperUtils.dataInfo
 	public static final DataInfo ORCIDDATAINFO = OafMapperUtils
 		.dataInfo(
 			false,
 			null,
 			false,
 			false,
 			OafMapperUtils
 				.qualifier(
 					ORCID_AUTHORS_CLASSID,
 					ORCID_AUTHORS_CLASSNAME,
 					ModelConstants.DNET_PROVENANCE_ACTIONS,
 					ModelConstants.DNET_PROVENANCE_ACTIONS),
 			"0.91");
 	// DataInfo set on the person-project relations read from the funder database; same arguments as
 	// ORCIDDATAINFO except for the provenance-action qualifier
 	public static final DataInfo FUNDERDATAINFO = OafMapperUtils
 		.dataInfo(
 			false,
 			null,
 			false,
 			false,
 			OafMapperUtils
 				.qualifier(
 					FUNDER_AUTHORS_CLASSID,
 					FUNDER_AUTHORS_CLASSNAME,
 					ModelConstants.DNET_PROVENANCE_ACTIONS,
 					ModelConstants.DNET_PROVENANCE_ACTIONS),
 			"0.91");
 	public static void main(final String[] args) throws IOException, ParseException {
@ -91,19 +141,130 @@ public class ExtractPerson implements Serializable {
 		final String workingDir = parser.get("workingDir");
 		log.info("workingDir {}", workingDir);
 		final String dbUrl = parser.get("postgresUrl");
 		final String dbUser = parser.get("postgresUser");
 		final String dbPassword = parser.get("postgresPassword");
 		final String hdfsNameNode = parser.get("hdfsNameNode");
 		SparkConf conf = new SparkConf();
 		runWithSparkSession(
 			conf,
 			isSparkSessionManaged,
 			spark -> {
 				HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
-				createActionSet(spark, inputPath, outputPath, workingDir);
+				extractInfoForActionSetFromORCID(spark, inputPath, workingDir);
 				extractInfoForActionSetFromProjects(
 					spark, inputPath, workingDir, dbUrl, dbUser, dbPassword, workingDir + "/project", hdfsNameNode);
 				createActionSet(spark, outputPath, workingDir);
 			});
 	}
-	private static void createActionSet(SparkSession spark, String inputPath, String outputPath, String workingDir) {
+	private static void extractInfoForActionSetFromProjects(SparkSession spark, String inputPath, String workingDir,
 		String dbUrl, String dbUser, String dbPassword, String hdfsPath, String hdfsNameNode) throws IOException {
 		Configuration conf = new Configuration();
 		conf.set("fs.defaultFS", hdfsNameNode);
 		FileSystem fileSystem = FileSystem.get(conf);
 		Path hdfsWritePath = new Path(hdfsPath);
 		FSDataOutputStream fos = fileSystem.create(hdfsWritePath);
 		try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
 			try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8))) {
 				dbClient.processResults(QUERY, rs -> writeRelation(getRelationWithProject(rs), writer));
 			}
 		} catch (IOException e) {
 			throw new RuntimeException(e);
 		}
 	}
 	/**
 	 * Builds the person-project relation described by the current row of the result set.
 	 * The columns {@code project}, {@code pid} and {@code role} are read, in this order.
 	 *
 	 * @param rs the result set positioned on the row to convert
 	 * @return the relation built from the row
 	 * @throws RuntimeException wrapping the {@link SQLException} raised while reading the row
 	 */
 	public static Relation getRelationWithProject(ResultSet rs) {
 		String projectCode;
 		String orcid;
 		String role;
 		try {
 			projectCode = rs.getString("project");
 			orcid = rs.getString("pid");
 			role = rs.getString("role");
 		} catch (final SQLException e) {
 			throw new RuntimeException(e);
 		}
 		return getProjectRelation(projectCode, orcid, role);
 	}
 	/**
 	 * Creates the validated relation linking a person to a project.
 	 * <p>
 	 * The source identifier is {@code PERSON_PREFIX} followed by "::" and the md5 of the orcid; the target one is
 	 * {@code PROJECT_ID_PREFIX} followed by the part of {@code project} preceding the first "::", by "::" and by
 	 * the md5 of the part following it.
 	 *
 	 * @param project the project code, split on its first "::"
 	 * @param orcid   the orcid of the person
 	 * @param role    the role of the person in the project; stored as the "role" property when not blank
 	 * @return the relation, with the properties set only when the role is not blank
 	 */
 	private static Relation getProjectRelation(String project, String orcid, String role) {
 		final String personId = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid);
 		final String projectNamespace = StringUtils.substringBefore(project, "::");
 		final String projectLocalId = StringUtils.substringAfter(project, "::");
 		final String projectId = PROJECT_ID_PREFIX + projectNamespace + "::" + IdentifierFactory.md5(projectLocalId);
 		final Relation participation = OafMapperUtils
 			.getRelation(
 				personId, projectId, ModelConstants.PROJECT_PERSON_RELTYPE, ModelConstants.PROJECT_PERSON_SUBRELTYPE,
 				ModelConstants.PROJECT_PERSON_PARTICIPATES,
 				collectedfromOpenAIRE,
 				FUNDERDATAINFO,
 				null);
 		participation.setValidated(true);
 		// the role is the only property: without it the properties of the relation are left untouched
 		if (StringUtil.isNotBlank(role)) {
 			final KeyValue roleProperty = new KeyValue();
 			roleProperty.setKey("role");
 			roleProperty.setValue(role);
 			final List<KeyValue> properties = new ArrayList<>();
 			properties.add(roleProperty);
 			participation.setProperties(properties);
 		}
 		return participation;
 	}
 	/**
 	 * Appends the relation to the writer as a single line of JSON.
 	 *
 	 * @param relation the relation to serialize
 	 * @param writer   the destination of the serialized relation
 	 * @throws RuntimeException wrapping the {@link IOException} raised by the serialization or by the writer
 	 */
 	protected static void writeRelation(final Relation relation, BufferedWriter writer) {
 		try {
 			final String json = OBJECT_MAPPER.writeValueAsString(relation);
 			writer.write(json);
 			writer.newLine();
 		} catch (final IOException e) {
 			throw new RuntimeException(e);
 		}
 	}
 	/**
 	 * Assembles the action set from the intermediate data found under the working directory.
 	 * <p>
 	 * The persons read from {@code /people} and the relations read from {@code /authorship},
 	 * {@code /coauthorship}, {@code /affiliation} and {@code /project} are wrapped into {@link AtomicAction}s and
 	 * saved under {@code outputPath} as a BZip2 compressed sequence file, whose key is the canonical name of the
 	 * payload class and whose value is the JSON serialization of the action.
 	 *
 	 * @param spark      the spark session
 	 * @param outputPath the path where the action set is stored
 	 * @param workingDir the directory holding the intermediate data
 	 */
 	private static void createActionSet(SparkSession spark, String outputPath, String workingDir) {
 		final Dataset<Person> persons = spark
 			.read()
 			.textFile(workingDir + "/people")
 			.map(
 				(MapFunction<String, Person>) json -> OBJECT_MAPPER.readValue(json, Person.class),
 				Encoders.bean(Person.class));
 		persons
 			.toJavaRDD()
 			.map(person -> new AtomicAction(person.getClass(), person))
 			.union(
 				getRelations(spark, workingDir + "/authorship")
 					.toJavaRDD()
 					.map(rel -> new AtomicAction(rel.getClass(), rel)))
 			.union(
 				getRelations(spark, workingDir + "/coauthorship")
 					.toJavaRDD()
 					.map(rel -> new AtomicAction(rel.getClass(), rel)))
 			.union(
 				getRelations(spark, workingDir + "/affiliation")
 					.toJavaRDD()
 					.map(rel -> new AtomicAction(rel.getClass(), rel)))
 			.union(
 				getRelations(spark, workingDir + "/project")
 					.toJavaRDD()
 					.map(rel -> new AtomicAction(rel.getClass(), rel)))
 			.mapToPair(action -> {
 				final Text payloadClass = new Text(action.getClazz().getCanonicalName());
 				final Text payload = new Text(OBJECT_MAPPER.writeValueAsString(action));
 				return new Tuple2<>(payloadClass, payload);
 			})
 			.saveAsHadoopFile(
 				outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
 	}
 	private static void extractInfoForActionSetFromORCID(SparkSession spark, String inputPath, String workingDir) {
 		Dataset<Author> authors = spark
 			.read()
 			.parquet(inputPath + "Authors")
@ -129,18 +290,13 @@ public class ExtractPerson implements Serializable {
 			.parquet(inputPath + "Employments")
 			.as(Encoders.bean(Employment.class));
 		Dataset<Author> peopleToMap = authors
 			.joinWith(works, authors.col("orcid").equalTo(works.col("orcid")))
 			.map((MapFunction<Tuple2<Author, Work>, Author>) t2 -> t2._1(), Encoders.bean(Author.class))
 			.groupByKey((MapFunction<Author, String>) a -> a.getOrcid(), Encoders.STRING())
 			.mapGroups((MapGroupsFunction<String, Author, Author>) (k, it) -> it.next(), Encoders.bean(Author.class));
 		Dataset<Employment> employment = employmentDataset
-			.joinWith(peopleToMap, employmentDataset.col("orcid").equalTo(peopleToMap.col("orcid")))
+			.joinWith(authors, employmentDataset.col("orcid").equalTo(authors.col("orcid")))
 			.map((MapFunction<Tuple2<Employment, Author>, Employment>) t2 -> t2._1(), Encoders.bean(Employment.class));
-		Dataset<Person> people;
+		// Mapping all the orcid profiles even if the profile has no visible works
-		peopleToMap.map((MapFunction<Author, Person>) op -> {
+
 		authors.map((MapFunction<Author, Person>) op -> {
 			Person person = new Person();
 			person.setId(DHPUtils.generateIdentifier(op.getOrcid(), PERSON_PREFIX));
 			person
@ -190,9 +346,19 @@ public class ExtractPerson implements Serializable {
 					OafMapperUtils
 						.structuredProperty(
 							op.getOrcid(), ModelConstants.ORCID, ModelConstants.ORCID_CLASSNAME,
-							ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, null));
+							ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES,
 								OafMapperUtils.dataInfo(false,
 										null,
 										false,
 										false,
 										OafMapperUtils.qualifier(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY,
 												ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY,
 												ModelConstants.DNET_PID_TYPES,
 												ModelConstants.DNET_PID_TYPES),
 								"0.91")));
 			person.setDateofcollection(op.getLastModifiedDate());
 			person.setOriginalId(Arrays.asList(op.getOrcid()));
 			person.setDataInfo(ORCIDDATAINFO);
 			return person;
 		}, Encoders.bean(Person.class))
 			.write()
@ -246,34 +412,6 @@ public class ExtractPerson implements Serializable {
 			.option("compression", "gzip")
 			.mode(SaveMode.Overwrite)
 			.json(workingDir + "/affiliation");
 		people = spark
 			.read()
 			.textFile(workingDir + "/people")
 			.map(
 				(MapFunction<String, Person>) value -> OBJECT_MAPPER
 					.readValue(value, Person.class),
 				Encoders.bean(Person.class));
 		people.show(false);
 		people
 			.toJavaRDD()
 			.map(p -> new AtomicAction(p.getClass(), p))
 			.union(
 				getRelations(spark, workingDir + "/authorship").toJavaRDD().map(r -> new AtomicAction(r.getClass(), r)))
 			.union(
 				getRelations(spark, workingDir + "/coauthorship")
 					.toJavaRDD()
 					.map(r -> new AtomicAction(r.getClass(), r)))
 			.union(
 				getRelations(spark, workingDir + "/affiliation")
 					.toJavaRDD()
 					.map(r -> new AtomicAction(r.getClass(), r)))
 			.mapToPair(
 				aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
 					new Text(OBJECT_MAPPER.writeValueAsString(aa))))
 			.saveAsHadoopFile(
 				outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
 	}
 	private static Dataset<Relation> getRelations(SparkSession spark, String path) {
@ -307,15 +445,9 @@ public class ExtractPerson implements Serializable {
 				source, target, ModelConstants.ORG_PERSON_RELTYPE, ModelConstants.ORG_PERSON_SUBRELTYPE,
 				ModelConstants.ORG_PERSON_PARTICIPATES,
 				Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
-				OafMapperUtils
+				ORCIDDATAINFO,
 					.dataInfo(
 						false, null, false, false,
 						OafMapperUtils
 							.qualifier(
 								ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME, ModelConstants.DNET_PROVENANCE_ACTIONS,
 								ModelConstants.DNET_PROVENANCE_ACTIONS),
 						"0.91"),
 				null);
 		relation.setValidated(true);
 		if (Optional.ofNullable(row.getStartDate()).isPresent() && StringUtil.isNotBlank(row.getStartDate())) {
 			KeyValue kv = new KeyValue();
@ -336,45 +468,6 @@ public class ExtractPerson implements Serializable {
 	}
 	private static Collection<? extends Relation> getCoAuthorshipRelations(String orcid1, String orcid2) {
 		String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid1);
 		String target = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid2);
 		return Arrays
 			.asList(
 				OafMapperUtils
 					.getRelation(
 						source, target, ModelConstants.PERSON_PERSON_RELTYPE,
 						ModelConstants.PERSON_PERSON_SUBRELTYPE,
 						ModelConstants.PERSON_PERSON_HASCOAUTHORED,
 						Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
 						OafMapperUtils
 							.dataInfo(
 								false, null, false, false,
 								OafMapperUtils
 									.qualifier(
 										ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME,
 										ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
 								"0.91"),
 						null),
 				OafMapperUtils
 					.getRelation(
 						target, source, ModelConstants.PERSON_PERSON_RELTYPE,
 						ModelConstants.PERSON_PERSON_SUBRELTYPE,
 						ModelConstants.PERSON_PERSON_HASCOAUTHORED,
 						Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
 						OafMapperUtils
 							.dataInfo(
 								false, null, false, false,
 								OafMapperUtils
 									.qualifier(
 										ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME,
 										ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
 								"0.91"),
 						null));
 	}
 	private static @NotNull Iterator<Relation> getAuthorshipRelationIterator(Work w) {
 		if (Optional.ofNullable(w.getPids()).isPresent())
@ -417,21 +510,15 @@ public class ExtractPerson implements Serializable {
 			default:
 				return null;
 		}
-
+		Relation relation = OafMapperUtils
 		return OafMapperUtils
 			.getRelation(
 				source, target, ModelConstants.RESULT_PERSON_RELTYPE,
 				ModelConstants.RESULT_PERSON_SUBRELTYPE,
 				ModelConstants.RESULT_PERSON_HASAUTHORED,
 				Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
-				OafMapperUtils
+				ORCIDDATAINFO,
 					.dataInfo(
 						false, null, false, false,
 						OafMapperUtils
 							.qualifier(
 								ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME, ModelConstants.DNET_PROVENANCE_ACTIONS,
 								ModelConstants.DNET_PROVENANCE_ACTIONS),
 						"0.91"),
 				null);
 		relation.setValidated(true);
 		return relation;
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties
@ -31,9 +31,11 @@ spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListen
 # The following is needed as a property of a workflow
 oozie.wf.application.path=${oozieTopWfApplicationPath}
-crossrefInputPath=/data/bip-affiliations/crossref-data.json
+crossrefInputPath=/data/openaire-affiliations/crossref-data.json
-pubmedInputPath=/data/bip-affiliations/pubmed-data.json
+pubmedInputPath=/data/openaire-affiliations/pubmed-data-v4.json
-openapcInputPath=/data/bip-affiliations/openapc-data.json
+openapcInputPath=/data/openaire-affiliations/openapc-data.json
-dataciteInputPath=/data/bip-affiliations/datacite-data.json
+dataciteInputPath=/data/openaire-affiliations/datacite-data.json
 webCrawlInputPath=/data/openaire-affiliations/webCrawl
 publisherInputPath=/data/openaire-affiliations/publishers
-outputPath=/tmp/crossref-affiliations-output-v5
+outputPath=/tmp/affRoAS
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml
@ -1,4 +1,4 @@
-<workflow-app name="BipAffiliations" xmlns="uri:oozie:workflow:0.5">
+<workflow-app name="OpenAIREAffiliations" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
@ -21,6 +21,10 @@
            <name>webCrawlInputPath</name>
            <description>the path where to find the inferred affiliation relations from webCrawl</description>
        </property>
        <property>
            <name>publisherInputPath</name>
            <description>the path where to find the inferred affiliation relations from publisher websites</description>
        </property>
        <property>
            <name>outputPath</name>
            <description>the path where to store the actionset</description>
@ -99,7 +103,7 @@
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
-            <name>Produces the atomic action with the inferred by BIP! affiliation relations (from Crossref and Pubmed)</name>
+            <name>Produces the atomic action with the inferred by OpenAIRE affiliation relations</name>
            <class>eu.dnetlib.dhp.actionmanager.bipaffiliations.PrepareAffiliationRelations</class>
            <jar>dhp-aggregation-${projectVersion}.jar</jar>
            <spark-opts>
@ -117,6 +121,7 @@
            <arg>--openapcInputPath</arg><arg>${openapcInputPath}</arg>
            <arg>--dataciteInputPath</arg><arg>${dataciteInputPath}</arg>
            <arg>--webCrawlInputPath</arg><arg>${webCrawlInputPath}</arg>
            <arg>--publisherInputPath</arg><arg>${publisherInputPath}</arg>
            <arg>--outputPath</arg><arg>${outputPath}</arg>
        </spark>
        <ok to="End"/>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/as_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/as_parameters.json
@ -21,5 +21,30 @@
  "paramLongName": "workingDir",
  "paramDescription": "the hdfs name node",
  "paramRequired": false
 },
  {
    "paramName": "pu",
    "paramLongName": "postgresUrl",
    "paramDescription": "the JDBC url of the postgres database",
    "paramRequired": false
  },
  {
    "paramName": "ps",
    "paramLongName": "postgresUser",
    "paramDescription": "the user of the postgres database",
    "paramRequired": false
  },
  {
  "paramName": "pp",
  "paramLongName": "postgresPassword",
  "paramDescription": "the password of the postgres user",
  "paramRequired": false
 },{
  "paramName": "nn",
  "paramLongName": "hdfsNameNode",
  "paramDescription": "the hdfs name node",
  "paramRequired": false
 }
 ]
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/job.properties
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/job.properties
@ -1,2 +1,5 @@
 inputPath=/data/orcid_2023/tables/
-outputPath=/user/miriam.baglioni/peopleAS
+outputPath=/user/miriam.baglioni/peopleAS
 postgresUrl=jdbc:postgresql://beta.services.openaire.eu:5432/dnet_openaireplus
 postgresUser=dnet
 postgresPassword=dnetPwd
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/oozie_app/workflow.xml
@ -9,6 +9,18 @@
            <name>outputPath</name>
            <description>the path where to store the actionset</description>
        </property>
        <property>
            <name>postgresUrl</name>
            <description>the JDBC url of the postgres database</description>
        </property>
        <property>
            <name>postgresUser</name>
            <description>the user name used to connect to the postgres database</description>
        </property>
        <property>
            <name>postgresPassword</name>
            <description>the password used to connect to the postgres database</description>
        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
@ -102,6 +114,10 @@
            <arg>--inputPath</arg><arg>${inputPath}</arg>
            <arg>--outputPath</arg><arg>${outputPath}</arg>
            <arg>--workingDir</arg><arg>${workingDir}</arg>
            <arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
            <arg>--postgresUrl</arg><arg>${postgresUrl}</arg>
            <arg>--postgresUser</arg><arg>${postgresUser}</arg>
            <arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/transformativeagreement/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/transformativeagreement/oozie_app/workflow.xml
@ -24,7 +24,7 @@
    <decision name="resume_from">
        <switch>
-            <case to="download">${wf:conf('resumeFrom') eq 'DownloadDump'}</case>
+            <case to="reset_workingDir">${wf:conf('resumeFrom') eq 'DownloadDump'}</case>
            <default to="create_actionset"/> <!-- first action to be done when downloadDump is to be performed -->
        </switch>
    </decision>
@ -33,6 +33,14 @@
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <!-- Deletes ${workingDir} and recreates it empty, then continues with the download action -->
    <action name="reset_workingDir">
        <fs>
            <delete path="${workingDir}"/>
            <mkdir path="${workingDir}"/>
        </fs>
        <ok to="download"/>
        <error to="Kill"/>
    </action>
    <action name="download">
        <shell xmlns="uri:oozie:shell-action:0.2">
            <job-tracker>${jobTracker}</job-tracker>
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala
@ -14,7 +14,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.{
  PidType
 }
 import eu.dnetlib.dhp.utils.DHPUtils
-import org.apache.commons.lang.StringUtils
+import org.apache.commons.lang3.StringUtils
 import org.apache.spark.sql.Row
 import org.json4s
 import org.json4s.DefaultFormats
@ -673,11 +673,12 @@ case object Crossref2Oaf {
    val doi = input.getString(0)
    val rorId = input.getString(1)
-    val pubId = s"50|${PidType.doi.toString.padTo(12, "_")}::${DoiCleaningRule.clean(doi)}"
+
    val pubId = IdentifierFactory.idFromPid("50", "doi", DoiCleaningRule.clean(doi), true)
    val affId = GenerateRorActionSetJob.calculateOpenaireId(rorId)
    val r: Relation = new Relation
-    DoiCleaningRule.clean(doi)
+
    r.setSource(pubId)
    r.setTarget(affId)
    r.setRelType(ModelConstants.RESULT_ORGANIZATION)
@ -978,7 +979,26 @@ case object Crossref2Oaf {
            case "10.13039/501100010790" =>
              generateSimpleRelationFromAward(funder, "erasmusplus_", a => a)
            //Danish funders: these cases have to precede the catch-all default, cases listed after `case _` never match
            //Independent Research Fund Denmark (IRFD)
            case "10.13039/501100004836" =>
              generateSimpleRelationFromAward(funder, "irfd________", a => a)
              val targetId = getProjectId("irfd________", "1e5e62235d094afd01cd56e65112fc63")
              queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
              queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
            //Carlsberg Foundation (CF)
            case "10.13039/501100002808" =>
              generateSimpleRelationFromAward(funder, "cf__________", a => a)
              val targetId = getProjectId("cf__________", "1e5e62235d094afd01cd56e65112fc63")
              queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
              queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
            //Novo Nordisk Foundation (NNF)
            case "10.13039/501100009708" =>
              //same 12-character prefix as the one passed to getProjectId below
              generateSimpleRelationFromAward(funder, "nnf_________", a => a)
              val targetId = getProjectId("nnf_________", "1e5e62235d094afd01cd56e65112fc63")
              queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
              queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
            //catch-all default: must stay the last case of the match
            case _ => logger.debug("no match for " + funder.DOI.get)
          }
        } else {
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java
@ -98,9 +98,9 @@ public class PrepareAffiliationRelationsTest {
 					"-crossrefInputPath", crossrefAffiliationRelationPathNew,
 					"-pubmedInputPath", crossrefAffiliationRelationPath,
 					"-openapcInputPath", crossrefAffiliationRelationPathNew,
-					"-dataciteInputPath", crossrefAffiliationRelationPath,
+					"-dataciteInputPath", crossrefAffiliationRelationPathNew,
-					"-webCrawlInputPath", crossrefAffiliationRelationPath,
+					"-webCrawlInputPath", crossrefAffiliationRelationPathNew,
-					"-publisherInputPath", publisherAffiliationRelationOldPath,
+					"-publisherInputPath", publisherAffiliationRelationPath,
 					"-outputPath", outputPath
 				});
@ -112,7 +112,7 @@ public class PrepareAffiliationRelationsTest {
 			.map(aa -> ((Relation) aa.getPayload()));
 		// count the number of relations
-		assertEquals(150, tmp.count());// 18 + 24 *3 + 30 * 2 =
+		assertEquals(162, tmp.count());// 18 + 24 + 30 * 4 =
 		Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
 		dataset.createOrReplaceTempView("result");
@ -123,7 +123,7 @@ public class PrepareAffiliationRelationsTest {
 		// verify that we have equal number of bi-directional relations
 		Assertions
 			.assertEquals(
-				75, execVerification
+				81, execVerification
 					.filter(
 						"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
 					.collectAsList()
@ -131,7 +131,7 @@ public class PrepareAffiliationRelationsTest {
 		Assertions
 			.assertEquals(
-				75, execVerification
+				81, execVerification
 					.filter(
 						"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
 					.collectAsList()
@ -158,7 +158,7 @@ public class PrepareAffiliationRelationsTest {
 		Assertions
 			.assertEquals(
-				2, execVerification.filter("source = '" + publisherid + "' and target = '" + rorId + "'").count());
+				4, execVerification.filter("source = '" + publisherid + "' and target = '" + rorId + "'").count());
 		Assertions
 			.assertEquals(
@ -173,7 +173,7 @@ public class PrepareAffiliationRelationsTest {
 		Assertions
 			.assertEquals(
-				3, execVerification
+				1, execVerification
 					.filter(
 						"source = '" + ID_PREFIX
 							+ IdentifierFactory
--- a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml
@ -63,6 +63,7 @@
        <path start="copy_software"/>
        <path start="copy_datasource"/>
        <path start="copy_project"/>
        <path start="copy_person"/>
        <path start="copy_organization"/>
    </fork>
@ -120,6 +121,15 @@
        <error to="Kill"/>
    </action>
    <!-- Copies the person folder from ${sourcePath} to ${outputPath} via distcp, then moves on to the `wait` node -->
    <action name="copy_person">
        <distcp xmlns="uri:oozie:distcp-action:0.2">
            <arg>${nameNode}/${sourcePath}/person</arg>
            <arg>${nameNode}/${outputPath}/person</arg>
        </distcp>
        <ok to="wait"/>
        <error to="Kill"/>
    </action>
    <action name="copy_datasource">
        <distcp xmlns="uri:oozie:distcp-action:0.2">
            <arg>${nameNode}/${sourcePath}/datasource</arg>
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java
@ -2,14 +2,13 @@
 package eu.dnetlib.dhp.oa.dedup;
 import java.util.*;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import org.apache.commons.beanutils.BeanUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.api.java.function.FlatMapGroupsFunction;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.ReduceFunction;
 import org.apache.spark.sql.*;
 import eu.dnetlib.dhp.oa.dedup.model.Identifier;
@ -107,6 +106,8 @@ public class DedupRecordFactory {
 					final HashSet<String> acceptanceDate = new HashSet<>();
 					boolean isVisible = false;
 					while (it.hasNext()) {
 						Tuple3<String, String, OafEntity> t = it.next();
 						OafEntity entity = t._3();
@ -114,6 +115,7 @@ public class DedupRecordFactory {
 						if (entity == null) {
 							aliases.add(t._2());
 						} else {
 							isVisible = isVisible || !entity.getDataInfo().getInvisible();
 							cliques.add(entity);
 							if (acceptanceDate.size() < MAX_ACCEPTANCE_DATE) {
@ -129,13 +131,20 @@ public class DedupRecordFactory {
 					}
-					if (acceptanceDate.size() >= MAX_ACCEPTANCE_DATE || cliques.isEmpty()) {
+					if (!isVisible || acceptanceDate.size() >= MAX_ACCEPTANCE_DATE || cliques.isEmpty()) {
 						return Collections.emptyIterator();
 					}
-					OafEntity mergedEntity = MergeUtils.mergeGroup(dedupId, cliques.iterator());
+					OafEntity mergedEntity = MergeUtils.mergeGroup(cliques.iterator());
 					// dedup records do not have date of transformation attribute
 					mergedEntity.setDateoftransformation(null);
 					mergedEntity
 						.setMergedIds(
 							Stream
 								.concat(cliques.stream().map(OafEntity::getId), aliases.stream())
 								.distinct()
 								.sorted()
 								.collect(Collectors.toList()));
 					return Stream
 						.concat(
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java
@ -91,7 +91,6 @@ public class SparkBlockStats extends AbstractSparkAction {
 				.read()
 				.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity))
 				.transform(deduper.model().parseJsonDataset())
 				.transform(deduper.filterAndCleanup())
 				.transform(deduper.generateClustersWithCollect())
 				.filter(functions.size(new Column("block")).geq(1));
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java
@ -5,11 +5,11 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PROVENANCE_ACTION
 import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVENANCE_DEDUP;
 import java.io.IOException;
 import java.util.Arrays;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
-import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.*;
 import org.apache.spark.sql.SparkSession;
 import org.dom4j.DocumentException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -17,6 +17,7 @@ import org.xml.sax.SAXException;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.schema.common.EntityType;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.DataInfo;
 import eu.dnetlib.dhp.schema.oaf.OafEntity;
@ -25,6 +26,8 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
 import eu.dnetlib.pace.config.DedupConfig;
 import scala.collection.JavaConversions;
 import scala.collection.JavaConverters;
 public class SparkCreateDedupRecord extends AbstractSparkAction {
@ -85,6 +88,36 @@ public class SparkCreateDedupRecord extends AbstractSparkAction {
 				.mode(SaveMode.Overwrite)
 				.option("compression", "gzip")
 				.json(outputPath);
 			log.info("Updating mergerels for: '{}'", subEntity);
 			final Dataset<Row> dedupIds = spark
 				.read()
 				.schema("`id` STRING, `mergedIds` ARRAY<STRING>")
 				.json(outputPath)
 				.selectExpr("id as source", "explode(mergedIds) as target");
 			spark
 				.read()
 				.load(mergeRelPath)
 				.where("relClass == 'merges'")
 				.join(dedupIds, JavaConversions.asScalaBuffer(Arrays.asList("source", "target")), "left_semi")
 				.write()
 				.mode(SaveMode.Overwrite)
 				.option("compression", "gzip")
 				.save(workingPath + "/mergerel_filtered");
 			final Dataset<Row> validRels = spark.read().load(workingPath + "/mergerel_filtered");
 			final Dataset<Row> filteredMergeRels = validRels
 				.union(
 					validRels
 						.withColumnRenamed("source", "source_tmp")
 						.withColumnRenamed("target", "target_tmp")
 						.withColumn("relClass", functions.lit(ModelConstants.IS_MERGED_IN))
 						.withColumnRenamed("target_tmp", "source")
 						.withColumnRenamed("source_tmp", "target"));
 			saveParquet(filteredMergeRels, mergeRelPath, SaveMode.Overwrite);
 			removeOutputDir(spark, workingPath + "/mergerel_filtered");
 		}
 	}
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java
@ -69,6 +69,7 @@ public class SparkPropagateRelation extends AbstractSparkAction {
 		Dataset<Relation> mergeRels = spark
 			.read()
 			.schema(REL_BEAN_ENC.schema())
 			.load(DedupUtility.createMergeRelPath(workingPath, "*", "*"))
 			.as(REL_BEAN_ENC);
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/DatasetMergerTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/DatasetMergerTest.java
@ -46,8 +46,8 @@ class DatasetMergerTest implements Serializable {
 	}
 	@Test
-	void datasetMergerTest() throws InstantiationException, IllegalAccessException, InvocationTargetException {
+	void datasetMergerTest() {
-		Dataset pub_merged = MergeUtils.mergeGroup(dedupId, datasets.stream().map(Tuple2::_2).iterator());
+		Dataset pub_merged = MergeUtils.mergeGroup(datasets.stream().map(Tuple2::_2).iterator());
 		// verify id
 		assertEquals(dedupId, pub_merged.getId());
--- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json
@ -96,7 +96,7 @@
        "aggregation": "MAX",
        "positive": "layer4",
        "negative": "NO_MATCH",
-        "undefined": "MATCH",
+        "undefined": "layer4",
        "ignoreUndefined": "true"
      },
      "layer4": {
--- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala
@ -7,7 +7,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactor
 import eu.dnetlib.dhp.utils.DHPUtils
 import eu.dnetlib.doiboost.DoiBoostMappingUtil
 import eu.dnetlib.doiboost.DoiBoostMappingUtil._
-import org.apache.commons.lang.StringUtils
+import org.apache.commons.lang3.StringUtils
 import org.json4s
 import org.json4s.DefaultFormats
 import org.json4s.JsonAST._
@ -560,9 +560,32 @@ case object Crossref2Oaf {
                "10.13039/501100000266" | "10.13039/501100006041" | "10.13039/501100000265" | "10.13039/501100000270" |
                "10.13039/501100013589" | "10.13039/501100000271" =>
              generateSimpleRelationFromAward(funder, "ukri________", a => a)
-
+            //DFG
            case "10.13039/501100001659" =>
              val targetId = getProjectId("dfgf________", "1e5e62235d094afd01cd56e65112fc63")
              queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
              queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
            //Danish funders: these cases have to precede the catch-all default, cases listed after `case _` never match
            //Independent Research Fund Denmark (IRFD)
            case "10.13039/501100004836" =>
              generateSimpleRelationFromAward(funder, "irfd________", a => a)
              val targetId = getProjectId("irfd________", "1e5e62235d094afd01cd56e65112fc63")
              queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
              queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
            //Carlsberg Foundation (CF)
            case "10.13039/501100002808" =>
              generateSimpleRelationFromAward(funder, "cf__________", a => a)
              val targetId = getProjectId("cf__________", "1e5e62235d094afd01cd56e65112fc63")
              queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
              queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
            //Novo Nordisk Foundation (NNF)
            case "10.13039/501100009708" =>
              //same 12-character prefix as the one passed to getProjectId below
              generateSimpleRelationFromAward(funder, "nnf_________", a => a)
              val targetId = getProjectId("nnf_________", "1e5e62235d094afd01cd56e65112fc63")
              queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
              queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
            //catch-all default: must stay the last case of the match
            case _ => logger.debug("no match for " + funder.DOI.get)
          }
        } else {
--- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala
@ -6,7 +6,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
 import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Publication}
 import eu.dnetlib.doiboost.DoiBoostMappingUtil
 import eu.dnetlib.doiboost.DoiBoostMappingUtil.{createSP, generateDataInfo}
-import org.apache.commons.lang.StringUtils
+import org.apache.commons.lang3.StringUtils
 import org.json4s
 import org.json4s.DefaultFormats
 import org.json4s.JsonAST._
--- a/dhp-workflows/dhp-enrichment/pom.xml
+++ b/dhp-workflows/dhp-enrichment/pom.xml
@ -48,12 +48,7 @@
            <groupId>io.github.classgraph</groupId>
            <artifactId>classgraph</artifactId>
        </dependency>
-        <dependency>
+
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-aggregation</artifactId>
            <version>1.2.5-SNAPSHOT</version>
            <scope>compile</scope>
        </dependency>
    </dependencies>
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/api/Utils.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/api/Utils.java
@ -6,11 +6,11 @@ import java.io.Serializable;
 import java.util.*;
 import java.util.stream.Collectors;
 import org.apache.commons.lang3.StringUtils;
 import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import com.amazonaws.util.StringUtils;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.common.collect.Maps;
@ -81,7 +81,7 @@ public class Utils implements Serializable {
 		Community c = new Community();
 		c.setId(cm.getId());
 		c.setZenodoCommunities(cm.getOtherZenodoCommunities());
-		if (!StringUtils.isNullOrEmpty(cm.getZenodoCommunity()))
+		if (StringUtils.isNotBlank(cm.getZenodoCommunity()))
 			c.getZenodoCommunities().add(cm.getZenodoCommunity());
 		c.setSubjects(cm.getSubjects());
 		c.getSubjects().addAll(cm.getFos());
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/api/model/CommunityContentprovider.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/api/model/CommunityContentprovider.java
@ -13,13 +13,13 @@ public class CommunityContentprovider {
 	private String openaireId;
 	private SelectionConstraints selectioncriteria;
-	private String enabled;
+	private Boolean enabled;
-	public String getEnabled() {
+	public Boolean getEnabled() {
 		return enabled;
 	}
-	public void setEnabled(String enabled) {
+	public void setEnabled(Boolean enabled) {
 		this.enabled = enabled;
 	}
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraint.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraint.java
@ -4,7 +4,7 @@ package eu.dnetlib.dhp.bulktag.community;
 import java.io.Serializable;
 import java.lang.reflect.InvocationTargetException;
-import org.apache.htrace.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonIgnore;
 import eu.dnetlib.dhp.bulktag.criteria.Selection;
 import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/person/SparkExtractPersonRelations.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/person/SparkExtractPersonRelations.java
@ -0,0 +1,302 @@
 package eu.dnetlib.dhp.person;
 import static com.ibm.icu.text.PluralRules.Operand.w;
 import static eu.dnetlib.dhp.PropagationConstant.*;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import java.io.Serializable;
 import java.util.*;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.FilterFunction;
 import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.MapGroupsFunction;
 import org.apache.spark.sql.*;
 import org.apache.spark.sql.Dataset;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.person.CoAuthorshipIterator;
 import eu.dnetlib.dhp.common.person.Coauthors;
 import eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
 import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
 import scala.Tuple2;
 public class SparkExtractPersonRelations {
 	// Logger registered under this job's own class, so that log lines are attributed to it.
 	private static final Logger log = LoggerFactory.getLogger(SparkExtractPersonRelations.class);
 	// Identifier prefix of the person entities in the orcid namespace; getRelation appends "::" and the md5 of the
 	// orcid to it.
 	private static final String PERSON_PREFIX = ModelSupport.getIdPrefix(Person.class) + "|orcid_______";
 	// DataInfo shared by all the authorship relations created by this job: provenance action
 	// SYSIMPORT_CROSSWALK_REPOSITORY, "openaire" and "0.85" as string arguments (presumably inference provenance
 	// and trust - confirm against OafMapperUtils.dataInfo).
 	public static final DataInfo DATAINFO = OafMapperUtils
 		.dataInfo(
 			false,
 			"openaire",
 			true,
 			false,
 			OafMapperUtils
 				.qualifier(
 					ModelConstants.SYSIMPORT_CROSSWALK_REPOSITORY,
 					ModelConstants.SYSIMPORT_CROSSWALK_REPOSITORY,
 					ModelConstants.DNET_PROVENANCE_ACTIONS,
 					ModelConstants.DNET_PROVENANCE_ACTIONS),
 			"0.85");
 	/**
 	 * Job entry point: parses the command line arguments, then runs the extraction of the person relations
 	 * followed by the removal of the persons that are not the source of any relation.
 	 *
 	 * @param args the command line arguments, described by input_personpropagation_parameters.json
 	 * @throws Exception if the parameter specification cannot be read or parsed, or if the Spark job fails
 	 */
 	public static void main(String[] args) throws Exception {
 		// the parameter specification is a classpath resource: load it through this class and decode it
 		// explicitly as UTF-8 instead of relying on the platform default charset
 		final String jsonConfiguration = IOUtils
 			.toString(
 				SparkExtractPersonRelations.class
 					.getResourceAsStream(
 						"/eu/dnetlib/dhp/wf/subworkflows/person/input_personpropagation_parameters.json"),
 				java.nio.charset.StandardCharsets.UTF_8);
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
 		parser.parseArgument(args);
 		final Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
 		final String sourcePath = parser.get("sourcePath");
 		log.info("sourcePath: {}", sourcePath);
 		// the "outputPath" argument is used as the working location of the job
 		final String workingPath = parser.get("outputPath");
 		log.info("workingPath: {}", workingPath);
 		final SparkConf conf = new SparkConf();
 		runWithSparkSession(
 			conf,
 			isSparkSessionManaged,
 			spark -> {
 				extractRelations(spark, sourcePath, workingPath);
 				removeIsolatedPerson(spark, sourcePath, workingPath);
 			});
 	}
 	/**
 	 * Keeps only the persons whose id is the source of at least one relation. The surviving persons are first
 	 * written under the working path and then copied back over the person folder of the source path.
 	 *
 	 * @param spark the active Spark session
 	 * @param sourcePath the graph location; "person" and "relation" are appended to it as-is
 	 * @param workingPath the staging location; "person" is appended to it as-is
 	 */
 	private static void removeIsolatedPerson(SparkSession spark, String sourcePath, String workingPath) {
 		final Encoder<Person> personEncoder = Encoders.bean(Person.class);
 		final Encoder<Relation> relationEncoder = Encoders.bean(Relation.class);
 		final String personLocation = sourcePath + "person";
 		final String stagingLocation = workingPath + "person";
 		final Dataset<Person> persons = spark
 			.read()
 			.schema(personEncoder.schema())
 			.json(personLocation)
 			.as(personEncoder);
 		final Dataset<Relation> relations = spark
 			.read()
 			.schema(relationEncoder.schema())
 			.json(sourcePath + "relation")
 			.as(relationEncoder);
 		// left_semi: a person is retained when at least one relation has it as source
 		final Column isRelationSource = persons.col("id").equalTo(relations.col("source"));
 		persons
 			.join(relations, isRelationSource, "left_semi")
 			.write()
 			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
 			.json(stagingLocation);
 		// the staged copy replaces the original person folder
 		spark
 			.read()
 			.schema(personEncoder.schema())
 			.json(stagingLocation)
 			.write()
 			.option("compression", "gzip")
 			.mode(SaveMode.Overwrite)
 			.json(personLocation);
 	}
 	/**
 	 * Creates the authorship and co-authorship relations that can be derived from the orcid / orcid_pending pids
 	 * of the result authors, keeps only those that are not already among the relations of the graph, and appends
 	 * them to the relation folder of the source path.
 	 *
 	 * For every result type the new relations are appended as gzipped json under workingPath; once all the types
 	 * have been processed the content of workingPath is read back and appended to sourcePath + "relation".
 	 *
 	 * NOTE(review): paths are built by plain concatenation, so sourcePath presumably ends with a separator.
 	 * NOTE(review): workingPath is written in Append mode, so content already there would be read back and
 	 * appended to the graph as well; confirm it is empty before the job starts.
 	 *
 	 * @param spark the active Spark session
 	 * @param sourcePath the graph location, holding one folder per entity type plus "relation"
 	 * @param workingPath the location where the new relations are staged
 	 */
 	private static void extractRelations(SparkSession spark, String sourcePath, String workingPath) {
 		// existing relations, keyed by the concatenation of source, relClass and target
 		Dataset<Tuple2<String, Relation>> relationDataset = spark
 			.read()
 			.schema(Encoders.bean(Relation.class).schema())
 			.json(sourcePath + "relation")
 			.as(Encoders.bean(Relation.class))
 			.map(
 				(MapFunction<Relation, Tuple2<String, Relation>>) r -> new Tuple2<>(
 					r.getSource() + r.getRelClass() + r.getTarget(), r),
 				Encoders.tuple(Encoders.STRING(), Encoders.bean(Relation.class)));
 		ModelSupport.entityTypes
 			.keySet()
 			.stream()
 			.filter(ModelSupport::isResult)
 			.forEach(
 				e -> {
 					// 1. search for results having orcid_pending and orcid in the set of pids for the authors
 					// (results deleted by inference, invisible or without authors are discarded first)
 					Dataset<Result> resultWithOrcids = spark
 						.read()
 						.schema(Encoders.bean(Result.class).schema())
 						.json(sourcePath + e.name())
 						.as(Encoders.bean(Result.class))
 						.filter(
 							(FilterFunction<Result>) r -> !r.getDataInfo().getDeletedbyinference() &&
 								!r.getDataInfo().getInvisible() &&
 								Optional
 									.ofNullable(r.getAuthor())
 									.isPresent())
 						.filter(
 							(FilterFunction<Result>) r -> r
 								.getAuthor()
 								.stream()
 								.anyMatch(
 									a -> Optional
 										.ofNullable(
 											a
 												.getPid())
 										.isPresent() &&
 										a
 											.getPid()
 											.stream()
 											.anyMatch(
 												p -> Arrays
 													.asList("orcid", "orcid_pending")
 													.contains(p.getQualifier().getClassid().toLowerCase()))));
 					// 2. create authorship relations between the result identifier and the person entity with
 					// orcid_pending.
 					Dataset<Tuple2<String, Relation>> newRelations = resultWithOrcids
 						.flatMap(
 							(FlatMapFunction<Result, Relation>) r -> getAuthorshipRelations(r),
 							Encoders.bean(Relation.class))
//							.groupByKey((MapFunction<Relation, String>) r-> r.getSource()+r.getTarget(), Encoders.STRING() )
//							.mapGroups((MapGroupsFunction<String, Relation, Relation>) (k,it) -> it.next(), Encoders.bean(Relation.class) )
 						.map(
 							(MapFunction<Relation, Tuple2<String, Relation>>) r -> new Tuple2<>(
 								r.getSource() + r.getRelClass() + r.getTarget(), r),
 							Encoders.tuple(Encoders.STRING(), Encoders.bean(Relation.class)));
 					// left join against the existing relations: only the new relations without a match are kept
 					newRelations
 						.joinWith(relationDataset, newRelations.col("_1").equalTo(relationDataset.col("_1")), "left")
 						.map((MapFunction<Tuple2<Tuple2<String, Relation>, Tuple2<String, Relation>>, Relation>) t2 -> {
 							if (t2._2() == null)
 								return t2._1()._2();
 							return null;
 						}, Encoders.bean(Relation.class))
 						.filter((FilterFunction<Relation>) r -> r != null)
 						.write()
 						.mode(SaveMode.Append)
 						.option("compression", "gzip")
 						.json(workingPath);
 					// 2.1 store in a separate location the relation between the person and the pids for the result?
 					// 3. create co_authorship relations between the pairs of authors with orcid/orcid_pending pids
 					// (one relation is retained for each distinct source + target pair)
 					newRelations = resultWithOrcids
 						.map((MapFunction<Result, Coauthors>) r -> getAuthorsPidList(r), Encoders.bean(Coauthors.class))
 						.flatMap(
 							(FlatMapFunction<Coauthors, Relation>) c -> new CoAuthorshipIterator(c.getCoauthors()),
 							Encoders.bean(Relation.class))
 						.groupByKey(
 							(MapFunction<Relation, String>) r -> r.getSource() + r.getTarget(), Encoders.STRING())
 						.mapGroups(
 							(MapGroupsFunction<String, Relation, Relation>) (k, it) -> it.next(),
 							Encoders.bean(Relation.class))
 						.map(
 							(MapFunction<Relation, Tuple2<String, Relation>>) r -> new Tuple2<>(
 								r.getSource() + r.getRelClass() + r.getTarget(), r),
 							Encoders.tuple(Encoders.STRING(), Encoders.bean(Relation.class)));
 					// same selection as for the authorship relations: drop what is already in the graph
 					newRelations
 						.joinWith(relationDataset, newRelations.col("_1").equalTo(relationDataset.col("_1")), "left")
 						.map((MapFunction<Tuple2<Tuple2<String, Relation>, Tuple2<String, Relation>>, Relation>) t2 -> {
 							if (t2._2() == null)
 								return t2._1()._2();
 							return null;
 						}, Encoders.bean(Relation.class))
 						.filter((FilterFunction<Relation>) r -> r != null)
 						.write()
 						.mode(SaveMode.Append)
 						.option("compression", "gzip")
 						.json(workingPath);
 				});
 		// append everything staged under workingPath to the relations of the graph
 		spark
 			.read()
 			.schema(Encoders.bean(Relation.class).schema())
 			.json(workingPath)
 			.write()
 			.mode(SaveMode.Append)
 			.option("compression", "gzip")
 			.json(sourcePath + "relation");
 	}
 	/**
 	 * Collects one orcid value for each author of the result that has an orcid or orcid_pending pid. A pid with
 	 * classid orcid is preferred, orcid_pending is the fallback; classids are compared ignoring the case.
 	 * Authors without pids, or without an orcid value, are skipped.
 	 *
 	 * @param r the result whose authors are inspected; its author list must not be null
 	 * @return a Coauthors instance holding the collected orcid values
 	 */
 	private static Coauthors getAuthorsPidList(Result r) {
 		final List<String> orcids = r
 			.getAuthor()
 			.stream()
 			// an author may come without any pid list at all
 			.filter(a -> a.getPid() != null)
 			.map(a -> {
 				Optional<StructuredProperty> pid = a
 					.getPid()
 					.stream()
 					.filter(p -> p.getQualifier().getClassid().equalsIgnoreCase("orcid"))
 					.findFirst();
 				if (!pid.isPresent())
 					pid = a
 						.getPid()
 						.stream()
 						.filter(p -> p.getQualifier().getClassid().equalsIgnoreCase("orcid_pending"))
 						.findFirst();
 				// null when the author has no orcid pid, or the pid has no value: dropped right below
 				return pid.map(StructuredProperty::getValue).orElse(null);
 			})
 			.filter(Objects::nonNull)
 			.collect(Collectors.toList());
 		final Coauthors coauth = new Coauthors();
 		coauth.setCoauthors(orcids);
 		return coauth;
 	}
 	/**
 	 * Creates one authorship relation for every orcid_pending pid (classid compared ignoring the case) found
 	 * among the authors of the result. Authors without pids are skipped.
 	 *
 	 * @param r the result whose authors are inspected; its author list must not be null
 	 * @return an iterator over the relations created for the result
 	 */
 	private static Iterator<Relation> getAuthorshipRelations(Result r) {
 		return r
 			.getAuthor()
 			.stream()
 			// an author may come without any pid list at all
 			.filter(a -> a.getPid() != null)
 			.flatMap(a -> a.getPid().stream())
 			.filter(p -> p.getQualifier().getClassid().equalsIgnoreCase("orcid_pending"))
 			.map(p -> getRelation(p.getValue(), r.getId()))
 			.filter(Objects::nonNull)
 			// materialise the relations before handing the iterator back
 			.collect(Collectors.toList())
 			.iterator();
 	}
 	/**
 	 * Builds the relation going from the person identified by the given orcid to the given result.
 	 *
 	 * @param orcid the orcid of the person; its md5 becomes the local part of the person identifier
 	 * @param resultId the identifier of the result, used as target of the relation
 	 * @return the relation, carrying DATAINFO and no collectedfrom
 	 */
 	private static Relation getRelation(String orcid, String resultId) {
 		final String personId = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid);
 		return OafMapperUtils
 			.getRelation(
 				personId,
 				resultId,
 				ModelConstants.RESULT_PERSON_RELTYPE,
 				ModelConstants.RESULT_PERSON_SUBRELTYPE,
 				ModelConstants.RESULT_PERSON_HASAUTHORED,
 				null, // collectedfrom = null
 				DATAINFO,
 				null);
 	}
 }
--- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/oozie_app/import.txt
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/oozie_app/import.txt
@ -7,4 +7,5 @@ community_organization classpath eu/dnetlib/dhp/wf/subworkflows/resulttocommunit
 result_project classpath eu/dnetlib/dhp/wf/subworkflows/projecttoresult/oozie_app
 community_project classpath eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromproject/oozie_app
 community_sem_rel classpath eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromsemrel/oozie_app
-country_propagation classpath eu/dnetlib/dhp/wf/subworkflows/countrypropagation/oozie_app
+country_propagation classpath eu/dnetlib/dhp/wf/subworkflows/countrypropagation/oozie_app
 person_propagation classpath eu/dnetlib/dhp/wf/subworkflows/person/oozie_app
--- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/oozie_app/workflow.xml
@ -122,6 +122,7 @@
            <case to="community_project">${wf:conf('resumeFrom') eq 'CommunityProject'}</case>
            <case to="community_sem_rel">${wf:conf('resumeFrom') eq 'CommunitySemanticRelation'}</case>
            <case to="country_propagation">${wf:conf('resumeFrom') eq 'CountryPropagation'}</case>
            <case to="person_propagation">${wf:conf('resumeFrom') eq 'PersonPropagation'}</case>
            <default to="orcid_propagation"/>
        </switch>
    </decision>
@ -291,10 +292,24 @@
                </property>
            </configuration>
        </sub-workflow>
        <ok to="person_propagation" />
        <error to="Kill" />
    </action>
    <action name="person_propagation">
        <sub-workflow>
            <app-path>${wf:appPath()}/person_propagation
            </app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>sourcePath</name>
                    <value>${outputPath}</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="country_propagation" />
        <error to="Kill" />
    </action>
    <action name="country_propagation">
        <sub-workflow>
            <app-path>${wf:appPath()}/country_propagation
@ -319,6 +334,8 @@
        <error to="Kill" />
    </action>
    <end name="End"/>
 </workflow-app>
--- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/oozie_app/workflow.xml
@ -34,6 +34,7 @@
        <path start="copy_organization"/>
        <path start="copy_projects"/>
        <path start="copy_datasources"/>
        <path start="copy_persons"/>
    </fork>
    <action name="copy_relation">
@ -80,6 +81,17 @@
        <error to="Kill"/>
    </action>
    <action name="copy_persons">
        <distcp xmlns="uri:oozie:distcp-action:0.2">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <arg>${nameNode}/${sourcePath}/person</arg>
            <arg>${nameNode}/${outputPath}/person</arg>
        </distcp>
        <ok to="copy_wait"/>
        <error to="Kill"/>
    </action>
    <join name="copy_wait" to="fork_prepare_assoc_step1"/>
    <fork name="fork_prepare_assoc_step1">
--- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/input_personpropagation_parameters.json
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/input_personpropagation_parameters.json
@ -0,0 +1,21 @@
 [
  {
    "paramName":"s",
    "paramLongName":"sourcePath",
    "paramDescription": "the path of the sequential file to read",
    "paramRequired": true
  },
  {
    "paramName": "out",
    "paramLongName": "outputPath",
    "paramDescription": "the path used to store temporary output files",
    "paramRequired": true
  },
  {
    "paramName": "ssm",
    "paramLongName": "isSparkSessionManaged",
    "paramDescription": "true if the spark session is managed, false otherwise",
    "paramRequired": false
  }
 ]
--- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/job.properties
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/job.properties
@ -0,0 +1 @@
 sourcePath=/tmp/miriam/13_graph_copy
--- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/oozie_app/config-default.xml
@ -0,0 +1,58 @@
 <configuration>
    <property>
        <name>jobTracker</name>
        <value>yarnRM</value>
    </property>
    <property>
        <name>nameNode</name>
        <value>hdfs://nameservice1</value>
    </property>
    <property>
        <name>oozie.use.system.libpath</name>
        <value>true</value>
    </property>
    <property>
        <name>oozie.action.sharelib.for.spark</name>
        <value>spark2</value>
    </property>
    <property>
        <name>hive_metastore_uris</name>
        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
    </property>
    <property>
        <name>spark2YarnHistoryServerAddress</name>
        <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
    </property>
    <property>
        <name>spark2EventLogDir</name>
        <value>/user/spark/spark2ApplicationHistory</value>
    </property>
    <property>
        <name>spark2ExtraListeners</name>
        <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
    </property>
    <property>
        <name>spark2SqlQueryExecutionListeners</name>
        <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
    </property>
    <property>
        <name>sparkExecutorNumber</name>
        <value>4</value>
    </property>
    <property>
        <name>sparkDriverMemory</name>
        <value>15G</value>
    </property>
    <property>
        <name>sparkExecutorMemory</name>
        <value>5G</value>
    </property>
    <property>
        <name>sparkExecutorCores</name>
        <value>4</value>
    </property>
    <property>
        <name>spark2MaxExecutors</name>
        <value>50</value>
    </property>
 </configuration>
--- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/oozie_app/workflow.xml
@ -0,0 +1,68 @@
 <workflow-app name="person_propagation" xmlns="uri:oozie:workflow:0.5">
    <!-- Sub-workflow with two steps: reset the working directory, then run the Spark job
         SparkExtractPersonRelations on the graph found at ${sourcePath}. -->
    <parameters>
        <property>
            <name>sourcePath</name>
            <description>the source path</description>
        </property>
        <!-- NOTE(review): ${workingDir} is used below but is not declared here; presumably it is
             provided by the caller's configuration. To be confirmed. -->
    </parameters>
    <global>
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <configuration>
            <property>
                <!-- NOTE(review): ${oozieActionShareLibForSpark2} is not declared in this file;
                     presumably it is supplied by the parent workflow. To be confirmed. -->
                <name>oozie.action.sharelib.for.spark</name>
                <value>${oozieActionShareLibForSpark2}</value>
            </property>
        </configuration>
    </global>
    <start to="reset_outputpath"/>
    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <!-- Start from a clean state: delete ${workingDir} and create it again. -->
    <action name="reset_outputpath">
        <fs>
            <delete path="${workingDir}"/>
            <mkdir path="${workingDir}"/>
        </fs>
        <ok to="extract_person_relation_from_graph"/>
        <error to="Kill"/>
    </action>
    <!-- Runs the Spark job in yarn cluster mode; the output path passed to the job is ${workingDir}/relation. -->
    <action name="extract_person_relation_from_graph">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>personPropagation</name>
            <class>eu.dnetlib.dhp.person.SparkExtractPersonRelations</class>
            <jar>dhp-enrichment-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.speculation=false
                --conf spark.hadoop.mapreduce.map.speculative=false
                --conf spark.hadoop.mapreduce.reduce.speculative=false
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <!-- The source path is passed with a trailing slash appended. -->
            <arg>--sourcePath</arg><arg>${sourcePath}/</arg>
            <arg>--outputPath</arg><arg>${workingDir}/relation</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
    <end name="End"/>
 </workflow-app>
--- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/person/PersonPropagationJobTest.java
+++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/person/PersonPropagationJobTest.java
@ -0,0 +1,93 @@
 package eu.dnetlib.dhp.person;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;
 import org.apache.commons.io.FileUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;
 import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob;
 import eu.dnetlib.dhp.schema.oaf.*;
 import scala.Tuple2;
 /**
  * Test for {@link SparkExtractPersonRelations}: runs the job on a local Spark session against the graph fixtures
  * available on the test classpath and reads back the relations it produces.
  */
 public class PersonPropagationJobTest {
 	private static final Logger log = LoggerFactory.getLogger(PersonPropagationJobTest.class);
 	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 	/** Classpath location of the graph fixtures (src/test/resources/eu/dnetlib/dhp/person/graph). */
 	private static final String GRAPH_RESOURCE = "/eu/dnetlib/dhp/person/graph";
 	private static SparkSession spark;
 	private static Path workingDir;
 	/**
 	 * Creates a temporary working directory and a local Spark session whose warehouse directories point to it.
 	 *
 	 * @throws IOException if the temporary directory cannot be created
 	 */
 	@BeforeAll
 	public static void beforeAll() throws IOException {
 		workingDir = Files.createTempDirectory(PersonPropagationJobTest.class.getSimpleName());
 		log.info("using work dir {}", workingDir);
 		SparkConf conf = new SparkConf();
 		conf.setAppName(PersonPropagationJobTest.class.getSimpleName());
 		conf.setMaster("local[*]");
 		conf.set("spark.driver.host", "localhost");
 		conf.set("hive.metastore.local", "true");
 		conf.set("spark.ui.enabled", "false");
 		conf.set("spark.sql.warehouse.dir", workingDir.toString());
 		conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
 		spark = SparkSession
 			.builder()
 			.appName(PersonPropagationJobTest.class.getSimpleName())
 			.config(conf)
 			.getOrCreate();
 	}
 	/**
 	 * Stops the Spark session and removes the temporary working directory.
 	 *
 	 * @throws IOException if the working directory cannot be deleted
 	 */
 	@AfterAll
 	public static void afterAll() throws IOException {
 		// stop Spark first, and clean up the directory even when stopping the session fails
 		try {
 			spark.stop();
 		} finally {
 			FileUtils.deleteDirectory(workingDir.toFile());
 		}
 	}
 	/**
 	 * Runs the job on the test graph, writing to the working directory, and loads the produced relations.
 	 *
 	 * @throws Exception if the job execution fails
 	 */
 	@Test
 	void testPersonPropagation() throws Exception {
 		// fail with an explicit message, instead of a bare NullPointerException, when the fixtures are missing
 		Assertions
 			.assertNotNull(
 				getClass().getResource(GRAPH_RESOURCE),
 				"test graph not found on the classpath: " + GRAPH_RESOURCE);
 		final String sourcePath = getClass()
 			.getResource(GRAPH_RESOURCE)
 			.getPath();
 		SparkExtractPersonRelations
 			.main(
 				new String[] {
 					"--isSparkSessionManaged", Boolean.FALSE.toString(),
 					"--sourcePath", sourcePath,
 					"--outputPath", workingDir.toString()
 				});
 		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
 		JavaRDD<Relation> tmp = sc
 			.textFile(workingDir.toString() + "/relation")
 			.map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
 		// TODO write assertions and find relevant information for the resource files
 	}
 }
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/dataset/part-00000
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/dataset/part-00000
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/otherresearchproduct/part-00000
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/otherresearchproduct/part-00000
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/publication/part-00000
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/publication/part-00000
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/relation/part-00000
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/relation/part-00000
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/software/part-00000
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/software/part-00000
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java
@ -72,9 +72,9 @@ public class GraphHiveTableImporterJob {
 		final Encoder<T> clazzEncoder = Encoders.bean(clazz);
 		Dataset<Row> dataset = spark
-				.read()
+			.read()
-				.schema(clazzEncoder.schema())
+			.schema(clazzEncoder.schema())
-				.json(inputPath);
+			.json(inputPath);
 		if (numPartitions > 0) {
 			log.info("repartitioning {} to {} partitions", clazz.getSimpleName(), numPartitions);
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java
@ -153,34 +153,40 @@ public abstract class AbstractMdRecordToOafMapper {
 			final DataInfo entityInfo = prepareDataInfo(doc, this.invisible);
 			final long lastUpdateTimestamp = new Date().getTime();
-			final List<Instance> instances = prepareInstances(doc, entityInfo, collectedFrom, hostedBy);
+			final Instance instance = prepareInstances(doc, entityInfo, collectedFrom, hostedBy);
-			final String type = getResultType(doc, instances);
+			if (!Optional
 				.ofNullable(instance.getInstancetype())
 				.map(Qualifier::getClassid)
 				.filter(StringUtils::isNotBlank)
 				.isPresent()) {
 				return Lists.newArrayList();
 			}
-			return createOafs(doc, type, instances, collectedFrom, entityInfo, lastUpdateTimestamp);
+			final String type = getResultType(instance);
 			return createOafs(doc, type, instance, collectedFrom, entityInfo, lastUpdateTimestamp);
 		} catch (final DocumentException e) {
 			log.error("Error with record:\n" + xml);
 			return Lists.newArrayList();
 		}
 	}
-	protected String getResultType(final Document doc, final List<Instance> instances) {
+	protected String getResultType(final Instance instance) {
-		final String type = doc.valueOf("//dr:CobjCategory/@type");
+		if (this.vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) {
 		if (StringUtils.isBlank(type) && this.vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) {
 			final String instanceType = instances
 				.stream()
 				.map(i -> i.getInstancetype().getClassid())
 				.findFirst()
 				.filter(s -> !UNKNOWN.equalsIgnoreCase(s))
 				.orElse("0000"); // Unknown
 			return Optional
-				.ofNullable(this.vocs.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, instanceType))
+				.ofNullable(instance.getInstancetype())
 				.map(Qualifier::getClassid)
 				.map(
 					instanceType -> Optional
 						.ofNullable(
 							this.vocs.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, instanceType))
 						.map(Qualifier::getClassid)
 						.orElse("0000"))
 				.orElse("0000");
 		} else {
 			throw new IllegalStateException("Missing vocabulary: " + ModelConstants.DNET_RESULT_TYPOLOGIES);
 		}
 		return type;
 	}
 	private KeyValue getProvenanceDatasource(final Document doc, final String xpathId, final String xpathName) {
@ -197,12 +203,12 @@ public abstract class AbstractMdRecordToOafMapper {
 	protected List<Oaf> createOafs(
 		final Document doc,
 		final String type,
-		final List<Instance> instances,
+		final Instance instance,
 		final KeyValue collectedFrom,
 		final DataInfo info,
 		final long lastUpdateTimestamp) {
-		final OafEntity entity = createEntity(doc, type, instances, collectedFrom, info, lastUpdateTimestamp);
+		final OafEntity entity = createEntity(doc, type, instance, collectedFrom, info, lastUpdateTimestamp);
 		final Set<String> originalId = Sets.newHashSet(entity.getOriginalId());
 		originalId.add(entity.getId());
@ -235,19 +241,19 @@ public abstract class AbstractMdRecordToOafMapper {
 	private OafEntity createEntity(final Document doc,
 		final String type,
-		final List<Instance> instances,
+		final Instance instance,
 		final KeyValue collectedFrom,
 		final DataInfo info,
 		final long lastUpdateTimestamp) {
 		switch (type.toLowerCase()) {
 			case "publication":
 				final Publication p = new Publication();
-				populateResultFields(p, doc, instances, collectedFrom, info, lastUpdateTimestamp);
+				populateResultFields(p, doc, instance, collectedFrom, info, lastUpdateTimestamp);
 				p.setJournal(prepareJournal(doc, info));
 				return p;
 			case "dataset":
 				final Dataset d = new Dataset();
-				populateResultFields(d, doc, instances, collectedFrom, info, lastUpdateTimestamp);
+				populateResultFields(d, doc, instance, collectedFrom, info, lastUpdateTimestamp);
 				d.setStoragedate(prepareDatasetStorageDate(doc, info));
 				d.setDevice(prepareDatasetDevice(doc, info));
 				d.setSize(prepareDatasetSize(doc, info));
@ -258,7 +264,7 @@ public abstract class AbstractMdRecordToOafMapper {
 				return d;
 			case "software":
 				final Software s = new Software();
-				populateResultFields(s, doc, instances, collectedFrom, info, lastUpdateTimestamp);
+				populateResultFields(s, doc, instance, collectedFrom, info, lastUpdateTimestamp);
 				s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info));
 				s.setLicense(prepareSoftwareLicenses(doc, info));
 				s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info));
@ -268,7 +274,7 @@ public abstract class AbstractMdRecordToOafMapper {
 			case "otherresearchproducts":
 			default:
 				final OtherResearchProduct o = new OtherResearchProduct();
-				populateResultFields(o, doc, instances, collectedFrom, info, lastUpdateTimestamp);
+				populateResultFields(o, doc, instance, collectedFrom, info, lastUpdateTimestamp);
 				o.setContactperson(prepareOtherResearchProductContactPersons(doc, info));
 				o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info));
 				o.setTool(prepareOtherResearchProductTools(doc, info));
@ -415,7 +421,7 @@ public abstract class AbstractMdRecordToOafMapper {
 	private void populateResultFields(
 		final Result r,
 		final Document doc,
-		final List<Instance> instances,
+		final Instance instance,
 		final KeyValue collectedFrom,
 		final DataInfo info,
 		final long lastUpdateTimestamp) {
@ -449,8 +455,8 @@ public abstract class AbstractMdRecordToOafMapper {
 		r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES
 		r.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
 		r.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
-		r.setInstance(instances);
+		r.setInstance(Arrays.asList(instance));
-		r.setBestaccessright(OafMapperUtils.createBestAccessRights(instances));
+		r.setBestaccessright(OafMapperUtils.createBestAccessRights(Arrays.asList(instance)));
 		r.setEoscifguidelines(prepareEOSCIfGuidelines(doc, info));
 	}
@ -509,7 +515,7 @@ public abstract class AbstractMdRecordToOafMapper {
 	protected abstract Qualifier prepareResourceType(Document doc, DataInfo info);
-	protected abstract List<Instance> prepareInstances(
+	protected abstract Instance prepareInstances(
 		Document doc,
 		DataInfo info,
 		KeyValue collectedfrom,
@ -657,13 +663,21 @@ public abstract class AbstractMdRecordToOafMapper {
 			final Node n = (Node) o;
 			final String classId = n.valueOf(xpathClassId).trim();
 			if (this.vocs.termExists(schemeId, classId)) {
-				res
+				final String value = n.getText();
-					.add(
+				if (StringUtils.isNotBlank(value)) {
-						HashableStructuredProperty
+					res
-							.newInstance(n.getText(), this.vocs.getTermAsQualifier(schemeId, classId), info));
+						.add(
 							HashableStructuredProperty
 								.newInstance(value, this.vocs.getTermAsQualifier(schemeId, classId), info));
 				}
 			}
 		}
-		return Lists.newArrayList(res);
+		return res
 			.stream()
 			.filter(Objects::nonNull)
 			.filter(p -> StringUtils.isNotBlank(p.getValue()))
 			.filter(p -> StringUtils.isNotBlank(p.getValue().trim()))
 			.collect(Collectors.toList());
 	}
 	protected List<StructuredProperty> prepareListStructProps(
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java
@ -133,7 +133,7 @@ public class GenerateEntitiesApplication extends AbstractMigrationApplication {
 					inputRdd
 						.keyBy(oaf -> ModelSupport.idFn().apply(oaf))
 						.groupByKey()
-						.map(t -> MergeUtils.mergeGroup(t._1, t._2.iterator())),
+						.map(t -> MergeUtils.mergeGroup(t._2.iterator())),
 					// .mapToPair(oaf -> new Tuple2<>(ModelSupport.idFn().apply(oaf), oaf))
 					// .reduceByKey(MergeUtils::merge)
 					// .map(Tuple2::_2),
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java
@ -519,6 +519,28 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
 						r1 = setRelationSemantic(r1, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO);
 						r2 = setRelationSemantic(r2, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO);
 						break;
 					case "resultOrganization_affiliation_isAuthorInstitutionOf":
 						if (!"organization".equals(sourceType)) {
 							throw new IllegalStateException(
 								String
 									.format(
 										"invalid claim, sourceId: %s, targetId: %s, semantics: %s", sourceId, targetId,
 										semantics));
 						}
 						r1 = setRelationSemantic(r1, RESULT_ORGANIZATION, AFFILIATION, IS_AUTHOR_INSTITUTION_OF);
 						r2 = setRelationSemantic(r2, RESULT_ORGANIZATION, AFFILIATION, HAS_AUTHOR_INSTITUTION);
 						break;
 					case "resultOrganization_affiliation_hasAuthorInstitution":
 						if (!"organization".equals(targetType)) {
 							throw new IllegalStateException(
 								String
 									.format(
 										"invalid claim, sourceId: %s, targetId: %s, semantics: %s", sourceId, targetId,
 										semantics));
 						}
 						r1 = setRelationSemantic(r1, RESULT_ORGANIZATION, AFFILIATION, HAS_AUTHOR_INSTITUTION);
 						r2 = setRelationSemantic(r2, RESULT_ORGANIZATION, AFFILIATION, IS_AUTHOR_INSTITUTION_OF);
 						break;
 					default:
 						throw new IllegalArgumentException("claim semantics not managed: " + semantics);
 				}
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java
@ -135,7 +135,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
 	}
 	@Override
-	protected List<Instance> prepareInstances(
+	protected Instance prepareInstances(
 		final Document doc,
 		final DataInfo info,
 		final KeyValue collectedfrom,
@ -197,7 +197,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
 			instance.getUrl().addAll(validUrl);
 		}
-		return Lists.newArrayList(instance);
+		return instance;
 	}
 	/**
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java
@ -126,7 +126,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
 	}
 	@Override
-	protected List<Instance> prepareInstances(
+	protected Instance prepareInstances(
 		final Document doc,
 		final DataInfo info,
 		final KeyValue collectedfrom,
@ -210,7 +210,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
 			instance.setUrl(new ArrayList<>());
 			instance.getUrl().addAll(validUrl);
 		}
-		return Arrays.asList(instance);
+		return instance;
 	}
 	protected String trimAndDecodeUrl(String url) {
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/PatchRelationsApplication.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/PatchRelationsApplication.java
@ -80,9 +80,6 @@ public class PatchRelationsApplication {
 		final Dataset<Relation> rels = readPath(spark, relationPath, Relation.class);
 		final Dataset<RelationIdMapping> idMapping = readPath(spark, idMappingPath, RelationIdMapping.class);
 		log.info("relations: {}", rels.count());
 		log.info("idMapping: {}", idMapping.count());
 		final Dataset<Relation> bySource = rels
 			.joinWith(idMapping, rels.col("source").equalTo(idMapping.col("oldId")), "left")
 			.map((MapFunction<Tuple2<Relation, RelationIdMapping>, Relation>) t -> {
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/enrich_graph_orcid_parameters.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/enrich_graph_orcid_parameters.json
@ -22,5 +22,11 @@
    "paramLongName": "targetPath",
    "paramDescription": "the output path of the graph enriched",
    "paramRequired": true
  },
  {
    "paramName": "wp",
    "paramLongName": "workingDir",
    "paramDescription": "the working dir",
    "paramRequired": true
  }
 ]
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/oozie_app/workflow.xml
@ -51,6 +51,7 @@
            <arg>--orcidPath</arg><arg>${orcidPath}</arg>
            <arg>--targetPath</arg><arg>${targetPath}</arg>
            <arg>--graphPath</arg><arg>${graphPath}</arg>
            <arg>--workingDir</arg><arg>${workingDir}</arg>
            <arg>--master</arg><arg>yarn</arg>
        </spark>
        <ok to="reset_outputpath"/>
@ -89,6 +90,14 @@
            <arg>${nameNode}/${graphPath}/project</arg>
            <arg>${nameNode}/${targetPath}/project</arg>
        </distcp>
        <ok to="copy_person"/>
        <error to="Kill"/>
    </action>
    <action name="copy_person">
        <distcp xmlns="uri:oozie:distcp-action:0.2">
            <arg>${nameNode}/${graphPath}/person</arg>
            <arg>${nameNode}/${targetPath}/person</arg>
        </distcp>
        <ok to="copy_relation"/>
        <error to="Kill"/>
    </action>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml
@ -142,6 +142,7 @@
        <path start="clean_datasource"/>
        <path start="clean_organization"/>
        <path start="clean_project"/>
        <path start="clean_person"/>
        <path start="clean_relation"/>
    </fork>
@ -161,6 +162,7 @@
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.autoBroadcastJoinThreshold=-1
                --conf spark.sql.shuffle.partitions=15000
            </spark-opts>
            <arg>--inputPath</arg><arg>${graphInputPath}/publication</arg>
@ -196,6 +198,7 @@
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.autoBroadcastJoinThreshold=-1
                --conf spark.sql.shuffle.partitions=8000
            </spark-opts>
            <arg>--inputPath</arg><arg>${graphInputPath}/dataset</arg>
@ -231,6 +234,7 @@
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.autoBroadcastJoinThreshold=-1
                --conf spark.sql.shuffle.partitions=5000
            </spark-opts>
            <arg>--inputPath</arg><arg>${graphInputPath}/otherresearchproduct</arg>
@ -266,6 +270,7 @@
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.autoBroadcastJoinThreshold=-1
                --conf spark.sql.shuffle.partitions=2000
            </spark-opts>
            <arg>--inputPath</arg><arg>${graphInputPath}/software</arg>
@ -301,6 +306,7 @@
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.autoBroadcastJoinThreshold=-1
                --conf spark.sql.shuffle.partitions=1000
            </spark-opts>
            <arg>--inputPath</arg><arg>${graphInputPath}/datasource</arg>
@ -336,6 +342,7 @@
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.autoBroadcastJoinThreshold=-1
                --conf spark.sql.shuffle.partitions=1000
            </spark-opts>
            <arg>--inputPath</arg><arg>${graphInputPath}/organization</arg>
@ -371,6 +378,7 @@
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.autoBroadcastJoinThreshold=-1
                --conf spark.sql.shuffle.partitions=2000
            </spark-opts>
            <arg>--inputPath</arg><arg>${graphInputPath}/project</arg>
@ -390,6 +398,42 @@
        <error to="Kill"/>
    </action>
    <!-- Cleans the person table: runs CleanGraphSparkJob for eu.dnetlib.dhp.schema.oaf.Person, reading
         ${graphInputPath}/person and writing ${graphOutputPath}/person. On success the flow moves to
         wait_clean, on failure to Kill. -->
    <action name="clean_person">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Clean person</name>
            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.autoBroadcastJoinThreshold=-1
                --conf spark.sql.shuffle.partitions=2000
            </spark-opts>
            <arg>--inputPath</arg><arg>${graphInputPath}/person</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/person</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
            <arg>--contextId</arg><arg>${contextId}</arg>
            <arg>--verifyParam</arg><arg>${verifyParam}</arg>
            <arg>--country</arg><arg>${country}</arg>
            <arg>--verifyCountryParam</arg><arg>${verifyCountryParam}</arg>
            <arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
            <arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
            <arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
            <arg>--deepClean</arg><arg>${shouldClean}</arg>
        </spark>
        <ok to="wait_clean"/>
        <error to="Kill"/>
    </action>
    <action name="clean_relation">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
@ -406,6 +450,7 @@
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.autoBroadcastJoinThreshold=-1
                --conf spark.sql.shuffle.partitions=20000
            </spark-opts>
            <arg>--inputPath</arg><arg>${graphInputPath}/relation</arg>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml
@ -102,6 +102,7 @@
        <path start="import_datasource"/>
        <path start="import_organization"/>
        <path start="import_project"/>
        <path start="import_person"/>
        <path start="import_relation"/>
    </fork>
@ -308,6 +309,35 @@
        <error to="Kill"/>
    </action>
    <!-- Imports the person table into Hive: runs GraphHiveTableImporterJob for
         eu.dnetlib.dhp.schema.oaf.Person, reading ${inputPath}/person into the ${hiveDbName} database.
         On success the flow moves to join_import, on failure to Kill. -->
    <action name="import_person">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Import table person</name>
            <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                --conf spark.sql.shuffle.partitions=1000
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/person</arg>
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
            <arg>--numPartitions</arg><arg>1000</arg>
        </spark>
        <ok to="join_import"/>
        <error to="Kill"/>
    </action>
    <action name="import_relation">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml
@ -68,6 +68,7 @@
        <path start="merge_datasource"/>
        <path start="merge_organization"/>
        <path start="merge_project"/>
        <path start="merge_person"/>
        <path start="merge_relation"/>
    </fork>
@ -260,6 +261,33 @@
        <error to="Kill"/>
    </action>
    <!-- Merges the person tables of the two input graphs: runs MergeGraphTableSparkJob for
         eu.dnetlib.dhp.schema.oaf.Person over ${betaInputGraphPath}/person and ${prodInputGraphPath}/person,
         writing ${graphOutputPath}/person. On success the flow moves to wait_merge, on failure to Kill. -->
    <action name="merge_person">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Merge person</name>
            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--betaInputPath</arg><arg>${betaInputGraphPath}/person</arg>
            <arg>--prodInputPath</arg><arg>${prodInputGraphPath}/person</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/person</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
            <arg>--priority</arg><arg>${priority}</arg>
        </spark>
        <ok to="wait_merge"/>
        <error to="Kill"/>
    </action>
    <action name="merge_relation">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml
@ -649,6 +649,7 @@
        <path start="merge_claims_datasource"/>
        <path start="merge_claims_organization"/>
        <path start="merge_claims_project"/>
        <path start="merge_claims_person"/>
        <path start="merge_claims_relation"/>
    </fork>
@ -860,6 +861,32 @@
        <error to="Kill"/>
    </action>
    <!-- Runs MergeClaimsApplication for eu.dnetlib.dhp.schema.oaf.Person, combining
         ${workingDir}/graph_raw with ${workingDir}/graph_claims into ${graphOutputPath}.
         On success the flow moves to wait_merge, on failure to Kill. -->
    <action name="merge_claims_person">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>MergeClaims_person</name>
            <class>eu.dnetlib.dhp.oa.graph.raw.MergeClaimsApplication</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory ${sparkExecutorMemory}
                --executor-cores ${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=200
            </spark-opts>
            <arg>--rawGraphPath</arg><arg>${workingDir}/graph_raw</arg>
            <arg>--claimsGraphPath</arg><arg>${workingDir}/graph_claims</arg>
            <arg>--outputRawGaphPath</arg><arg>${graphOutputPath}</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
        </spark>
        <ok to="wait_merge"/>
        <error to="Kill"/>
    </action>
    <join name="wait_merge" to="decisionPatchRelations"/>
    <decision name="decisionPatchRelations">
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala
@ -47,13 +47,15 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
    log.info(s"orcidPath is '$orcidPath'")
    val targetPath = parser.get("targetPath")
    log.info(s"targetPath is '$targetPath'")
    // working directory argument; logged under its own name (the label previously read "targetPath")
    val workingDir = parser.get("workingDir")
    log.info(s"workingDir is '$workingDir'")
-    createTemporaryData(graphPath, orcidPath, targetPath)
+    createTemporaryData(graphPath, orcidPath, workingDir)
-    analisys(targetPath)
+    analisys(workingDir)
-    generateGraph(graphPath, targetPath)
+    generateGraph(graphPath, workingDir, targetPath)
  }
-  private def generateGraph(graphPath: String, targetPath: String): Unit = {
+  private def generateGraph(graphPath: String, workingDir: String, targetPath: String): Unit = {
    ModelSupport.entityTypes.asScala
      .filter(e => ModelSupport.isResult(e._1))
@ -63,7 +65,7 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
        val matched = spark.read
          .schema(Encoders.bean(classOf[ORCIDAuthorEnricherResult]).schema)
-          .parquet(s"${targetPath}/${resultType}_matched")
+          .parquet(s"${workingDir}/${resultType}_matched")
          .selectExpr("id", "enriched_author")
        spark.read
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala
@ -133,7 +133,7 @@ object SparkCreateInputGraph {
    val ds: Dataset[T] = spark.read.load(sourcePath).as[T]
    ds.groupByKey(_.getId)
-      .mapGroups { (id, it) => MergeUtils.mergeGroup(id, it.asJava).asInstanceOf[T] }
+      .mapGroups { (id, it) => MergeUtils.mergeGroup(it.asJava).asInstanceOf[T] }
 //      .reduceGroups { (x: T, y: T) => MergeUtils.merge(x, y).asInstanceOf[T] }
 //      .map(_)
      .write
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java
@ -30,6 +30,8 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.common.RelationInverse;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
@ -365,6 +367,40 @@ class MigrateDbEntitiesApplicationTest {
 		assertValidId(r2.getCollectedfrom().get(0).getKey());
 	}
 	/**
 	 * Processes the claim mocked from claimsrel_resultset_affiliation.json and checks that it produces two
 	 * relations whose source and target are swapped, each carrying valid identifiers, data info with a
 	 * trust value, non-blank relClass and relType, and a valid collectedfrom key.
 	 */
 	@Test
 	void testProcessClaims_affiliation() throws Exception {
 		final List<TypedField> mockedFields = prepareMocks("claimsrel_resultset_affiliation.json");
 		final List<Oaf> claims = app.processClaims(rs);
 		assertEquals(2, claims.size());
 		verifyMocks(mockedFields);
 		// both produced objects must be relations
 		final Oaf firstOaf = claims.get(0);
 		final Oaf secondOaf = claims.get(1);
 		assertTrue(firstOaf instanceof Relation);
 		assertTrue(secondOaf instanceof Relation);
 		final Relation firstRel = (Relation) firstOaf;
 		final Relation secondRel = (Relation) secondOaf;
 		// identifiers
 		assertValidId(firstRel.getSource());
 		assertValidId(firstRel.getTarget());
 		assertValidId(secondRel.getSource());
 		assertValidId(secondRel.getTarget());
 		// data info and trust
 		assertNotNull(firstRel.getDataInfo());
 		assertNotNull(secondRel.getDataInfo());
 		assertNotNull(firstRel.getDataInfo().getTrust());
 		assertNotNull(secondRel.getDataInfo().getTrust());
 		// the two relations point in opposite directions
 		assertEquals(firstRel.getSource(), secondRel.getTarget());
 		assertEquals(secondRel.getSource(), firstRel.getTarget());
 		// semantics
 		assertTrue(StringUtils.isNotBlank(firstRel.getRelClass()));
 		assertTrue(StringUtils.isNotBlank(secondRel.getRelClass()));
 		assertTrue(StringUtils.isNotBlank(firstRel.getRelType()));
 		assertTrue(StringUtils.isNotBlank(secondRel.getRelType()));
 		// collectedfrom
 		assertValidId(firstRel.getCollectedfrom().get(0).getKey());
 		assertValidId(secondRel.getCollectedfrom().get(0).getKey());
 	}
 	private List<TypedField> prepareMocks(final String jsonFile) throws IOException, SQLException {
 		final String json = IOUtils.toString(getClass().getResourceAsStream(jsonFile));
 		final ObjectMapper mapper = new ObjectMapper();
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/claimsrel_resultset_affiliation.json
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/claimsrel_resultset_affiliation.json
@ -0,0 +1,27 @@
 [
 	{
 		"field": "source_type",
 		"type": "string",
 		"value": "organization"
 	},
 	{
 		"field": "source_id",
 		"type": "string",
 		"value": "openorgs____::b5ca9d4340e26454e367e2908ef3872f"
 	},
 	{
 		"field": "target_type",
 		"type": "string",
 		"value": "software"
 	},
 	{
 		"field": "target_id",
 		"type": "string",
 		"value": "userclaim___::bde53826d07c8cf47c99222a375cd2e8"
 	},
 	{
 		"field": "semantics",
 		"type": "string",
 		"value": "resultOrganization_affiliation_isAuthorInstitutionOf"
 	}
 ]
--- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala
@ -31,6 +31,7 @@ class ORCIDAuthorMatchersTest {
    assertTrue(matchOrderedTokenAndAbbreviations("孙林 Sun Lin", "Sun Lin"))
    // assertTrue(AuthorsMatchRevised.compare("孙林 Sun Lin", "孙林")); // not yet implemented
  }
  // Checks that a name written with abbreviated middle names ("C. A.") is matched against the
  // variant where a middle name is spelled out ("Antony").
  @Test def testDocumentationNames(): Unit = {
    assertTrue(matchOrderedTokenAndAbbreviations("James C. A. Miller-Jones", "James Antony Miller-Jones"))
  }
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java
@ -3,6 +3,7 @@ package eu.dnetlib.dhp.oa.provision;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import java.util.Comparator;
 import java.util.List;
 import java.util.Objects;
 import java.util.Optional;
@ -167,8 +168,9 @@ public class CreateRelatedEntitiesJob_phase1 {
 					result
 						.getDescription()
 						.stream()
-						.findFirst()
+						.filter(d -> Objects.nonNull(d.getValue()))
 						.map(Field::getValue)
 						.max(Comparator.comparingInt(String::length))
 						.ifPresent(
 							d -> re.setDescription(StringUtils.left(d, ModelHardLimits.MAX_RELATED_ABSTRACT_LENGTH)));
 				}
@ -231,6 +233,14 @@ public class CreateRelatedEntitiesJob_phase1 {
 				if (!f.isEmpty()) {
 					re.setFundingtree(f.stream().map(Field::getValue).collect(Collectors.toList()));
 				}
 				break;
 			case person:
 				final Person person = (Person) entity;
 				re.setGivenName(person.getGivenName());
 				re.setFamilyName(person.getFamilyName());
 				re.setAlternativeNames(person.getAlternativeNames());
 				break;
 		}
 		return re;
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PayloadConverterJob.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PayloadConverterJob.java
@ -2,10 +2,12 @@
 package eu.dnetlib.dhp.oa.provision;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import static eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits.MAX_RELATIONS_BY_RELCLASS;
 import static eu.dnetlib.dhp.utils.DHPUtils.toSeq;
 import java.util.List;
 import java.util.Map;
 import java.util.Objects;
 import java.util.Optional;
 import org.apache.commons.io.IOUtils;
@ -15,11 +17,13 @@ import org.apache.spark.api.java.function.FilterFunction;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.*;
 import org.apache.spark.util.LongAccumulator;
 import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.annotation.JsonInclude;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
@ -27,11 +31,13 @@ import eu.dnetlib.dhp.common.HdfsSupport;
 import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
 import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
 import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
 import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper;
 import eu.dnetlib.dhp.oa.provision.model.TupleWrapper;
 import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
 import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory;
 import eu.dnetlib.dhp.schema.oaf.DataInfo;
 import eu.dnetlib.dhp.schema.oaf.Oaf;
 import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
 import eu.dnetlib.dhp.schema.solr.SolrRecord;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -124,6 +130,9 @@ public class PayloadConverterJob {
 					.map(Oaf::getDataInfo)
 					.map(DataInfo::getDeletedbyinference)
 					.orElse(false))
 			.map(
 				(MapFunction<JoinedEntity, JoinedEntity>) PayloadConverterJob::pruneRelatedEntities,
 				Encoders.kryo(JoinedEntity.class))
 			.map(
 				(MapFunction<JoinedEntity, Tuple2<String, SolrRecord>>) je -> new Tuple2<>(
 					recordFactory.build(je, validateXML),
@ -139,6 +148,32 @@ public class PayloadConverterJob {
 			.json(outputPath);
 	}
 	/**
 	 * Prunes the RelatedEntityWrapper(s) associated to the JoinedEntity so that, for each relation class, at most
 	 * the number of links defined in eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits#MAX_RELATIONS_BY_RELCLASS
 	 * is kept. Relation classes without an entry in that map are not limited. Links are retained in their
 	 * original order, i.e. the first ones encountered for a given relation class are the ones kept.
 	 *
 	 * @param je the entity whose links are pruned in place; when its links are null nothing is changed
 	 * @return the same JoinedEntity instance received as input
 	 */
 	private static JoinedEntity pruneRelatedEntities(JoinedEntity je) {
 		if (je.getLinks() == null) {
 			return je;
 		}
 		final Map<String, Long> freqs = Maps.newHashMap();
 		final List<RelatedEntityWrapper> rew = Lists.newArrayList();
 		je.getLinks().forEach(link -> {
 			final String relClass = link.getRelation().getRelClass();
 			// number of links of this relation class kept so far
 			final long count = freqs.getOrDefault(relClass, 0L);
 			final long max = MAX_RELATIONS_BY_RELCLASS.getOrDefault(relClass, Long.MAX_VALUE);
 			// strict comparison: with 'count' links already kept, one more fits only while count < max
 			// (the previous 'count <= max' let max + 1 links through)
 			if (count < max) {
 				rew.add(link);
 				freqs.put(relClass, count + 1);
 			}
 		});
 		je.setLinks(rew);
 		return je;
 	}
 	private static void removeOutputDir(final SparkSession spark, final String path) {
 		HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
 	}
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java
@ -23,6 +23,7 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
 import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
 import eu.dnetlib.dhp.schema.solr.*;
 import eu.dnetlib.dhp.schema.solr.AccessRight;
 import eu.dnetlib.dhp.schema.solr.Author;
@ -37,6 +38,8 @@ import eu.dnetlib.dhp.schema.solr.Measure;
 import eu.dnetlib.dhp.schema.solr.OpenAccessColor;
 import eu.dnetlib.dhp.schema.solr.OpenAccessRoute;
 import eu.dnetlib.dhp.schema.solr.Organization;
 import eu.dnetlib.dhp.schema.solr.Person;
 import eu.dnetlib.dhp.schema.solr.PersonTopic;
 import eu.dnetlib.dhp.schema.solr.Pid;
 import eu.dnetlib.dhp.schema.solr.Project;
 import eu.dnetlib.dhp.schema.solr.Result;
@ -89,6 +92,8 @@ public class ProvisionModelSupport {
 			r.setOrganization(mapOrganization((eu.dnetlib.dhp.schema.oaf.Organization) e));
 		} else if (e instanceof eu.dnetlib.dhp.schema.oaf.Project) {
 			r.setProject(mapProject((eu.dnetlib.dhp.schema.oaf.Project) e, vocs));
 		} else if (e instanceof eu.dnetlib.dhp.schema.oaf.Person) {
 			r.setPerson(mapPerson((eu.dnetlib.dhp.schema.oaf.Person) e));
 		}
 		r
 			.setLinks(
@ -108,7 +113,7 @@ public class ProvisionModelSupport {
 		RelatedRecord rr = new RelatedRecord();
 		final RelatedEntity re = rew.getTarget();
-		final RecordType relatedRecordType = RecordType.valueOf(re.getType());
+		final RecordType relatedRecordType = RecordType.fromString(re.getType());
 		final Relation relation = rew.getRelation();
 		final String relationProvenance = Optional
 			.ofNullable(relation.getDataInfo())
@ -150,6 +155,17 @@ public class ProvisionModelSupport {
 		rr.setPublisher(re.getPublisher());
 		rr.setResulttype(mapQualifier(re.getResulttype()));
 		rr.setTitle(Optional.ofNullable(re.getTitle()).map(StructuredProperty::getValue).orElse(null));
 		rr.setDescription(StringUtils.left(re.getDescription(), ModelHardLimits.MAX_RELATED_ABSTRACT_LENGTH));
 		rr
 			.setAuthor(
 				Optional
 					.ofNullable(re.getAuthor())
 					.map(
 						aa -> aa
 							.stream()
 							.limit(ModelHardLimits.MAX_RELATED_AUTHORS)
 							.collect(Collectors.toList()))
 					.orElse(null));
 		if (relation.getValidated() == null) {
 			relation.setValidated(false);
@ -185,6 +201,18 @@ public class ProvisionModelSupport {
 		return ps;
 	}
 	/**
 	 * Builds the Person representation used in the Solr record from the graph Person, copying given name,
 	 * family name, alternative names, biography and consent.
 	 *
 	 * @param p the graph person to convert
 	 * @return a newly created Person carrying the copied values
 	 */
 	private static Person mapPerson(eu.dnetlib.dhp.schema.oaf.Person p) {
 		final Person mapped = new Person();
 		mapped.setGivenName(p.getGivenName());
 		mapped.setFamilyName(p.getFamilyName());
 		mapped.setAlternativeNames(p.getAlternativeNames());
 		mapped.setBiography(p.getBiography());
 		mapped.setConsent(p.getConsent());
 		// subjects are not mapped yet
 		return mapped;
 	}
 	private static Funding mapFunding(List<String> fundingtree, VocabularyGroup vocs) {
 		SAXReader reader = new SAXReader();
 		return Optional
@ -378,6 +406,7 @@ public class ProvisionModelSupport {
 		rs.setPubliclyFunded(r.getPubliclyFunded());
 		rs.setTransformativeAgreement(r.getTransformativeAgreement());
 		rs.setExternalReference(mapExternalReference(r.getExternalReference()));
 		rs.setBestinstancetype(mapQualifier(r.getBestInstancetype()));
 		rs.setInstance(mapInstances(r.getInstance()));
 		if (r instanceof Publication) {
@ -667,14 +696,23 @@ public class ProvisionModelSupport {
 	}
 	private static List<Author> asAuthor(List<eu.dnetlib.dhp.schema.oaf.Author> authorList) {
 		return asAuthor(authorList, ModelHardLimits.MAX_AUTHORS);
 	}
 	private static List<Author> asAuthor(List<eu.dnetlib.dhp.schema.oaf.Author> authorList, int maxAuthors) {
 		return Optional
 			.ofNullable(authorList)
 			.map(
 				authors -> authors
 					.stream()
 					.limit(maxAuthors)
 					.map(
 						a -> Author
-							.newInstance(a.getFullname(), a.getName(), a.getSurname(), a.getRank(), asPid(a.getPid())))
+							.newInstance(
 								StringUtils.left(a.getFullname(), ModelHardLimits.MAX_AUTHOR_FULLNAME_LENGTH),
 								a.getName(),
 								a.getSurname(),
 								a.getRank(), asPid(a.getPid())))
 					.collect(Collectors.toList()))
 			.orElse(null);
 	}
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java
@ -51,6 +51,11 @@ public class RelatedEntity implements Serializable {
 	private Qualifier contracttype;
 	private List<String> fundingtree;
 	// person
 	private String givenName;
 	private String familyName;
 	private List<String> alternativeNames;
 	public String getId() {
 		return id;
 	}
@ -251,6 +256,30 @@ public class RelatedEntity implements Serializable {
 		this.fundingtree = fundingtree;
 	}
 	/** @return the given name (person-specific field) */
 	public String getGivenName() {
 		return givenName;
 	}
 	/** @param givenName the given name to set */
 	public void setGivenName(String givenName) {
 		this.givenName = givenName;
 	}
 	/** @return the family name (person-specific field) */
 	public String getFamilyName() {
 		return familyName;
 	}
 	/** @param familyName the family name to set */
 	public void setFamilyName(String familyName) {
 		this.familyName = familyName;
 	}
 	/** @return the list of alternative names (person-specific field); the stored reference is returned as is */
 	public List<String> getAlternativeNames() {
 		return alternativeNames;
 	}
 	/** @param alternativeNames the list of alternative names to set; the reference is stored without copying */
 	public void setAlternativeNames(List<String> alternativeNames) {
 		this.alternativeNames = alternativeNames;
 	}
 	@Override
 	public boolean equals(Object o) {
 		if (this == o)
@ -280,7 +309,10 @@ public class RelatedEntity implements Serializable {
 			&& Objects.equal(code, that.code)
 			&& Objects.equal(acronym, that.acronym)
 			&& Objects.equal(contracttype, that.contracttype)
-			&& Objects.equal(fundingtree, that.fundingtree);
+			&& Objects.equal(fundingtree, that.fundingtree)
 			&& Objects.equal(givenName, that.givenName)
 			&& Objects.equal(familyName, that.familyName)
 			&& Objects.equal(alternativeNames, that.alternativeNames);
 	}
 	@Override
@ -309,6 +341,9 @@ public class RelatedEntity implements Serializable {
 				code,
 				acronym,
 				contracttype,
-				fundingtree);
+				fundingtree,
 				familyName,
 				givenName,
 				alternativeNames);
 	}
 }
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java
@ -1035,6 +1035,48 @@ public class XmlRecordFactory implements Serializable {
 								.collect(Collectors.toList()));
 				}
 				break;
 			case person:
 				final Person person = (Person) entity;
 				if (person.getGivenName() != null) {
 					metadata.add(XmlSerializationUtils.asXmlElement("givenname", person.getGivenName()));
 				}
 				if (person.getFamilyName() != null) {
 					metadata.add(XmlSerializationUtils.asXmlElement("familyname", person.getFamilyName()));
 				}
 				if (person.getAlternativeNames() != null) {
 					metadata
 						.addAll(
 							person
 								.getAlternativeNames()
 								.stream()
 								.map(altName -> XmlSerializationUtils.asXmlElement("alternativename", altName))
 								.collect(Collectors.toList()));
 				}
 				if (person.getBiography() != null) {
 					metadata.add(XmlSerializationUtils.asXmlElement("biography", person.getBiography()));
 				}
 				if (person.getSubject() != null) {
 					metadata
 						.addAll(
 							person
 								.getSubject()
 								.stream()
 								.map(pt -> {
 									List<Tuple2<String, String>> attrs = Lists.newArrayList();
 									attrs.add(new Tuple2<>("schema", pt.getSchema()));
 									attrs.add(new Tuple2<>("value", pt.getValue()));
 									attrs.add(new Tuple2<>("fromYear", String.valueOf(pt.getFromYear())));
 									attrs.add(new Tuple2<>("toYear", String.valueOf(pt.getToYear())));
 									return XmlSerializationUtils.asXmlElement("subject", attrs);
 								})
 								.collect(Collectors.toList()));
 				}
 				if (person.getConsent() != null) {
 					metadata.add(XmlSerializationUtils.asXmlElement("consent", String.valueOf(person.getConsent())));
 				}
 				break;
 			default:
 				throw new IllegalArgumentException("invalid entity type: " + type);
@ -1240,6 +1282,25 @@ public class XmlRecordFactory implements Serializable {
 								.collect(Collectors.toList()));
 				}
 				break;
 			case person:
 				if (isNotBlank(re.getGivenName())) {
 					metadata.add(XmlSerializationUtils.asXmlElement("givenname", re.getGivenName()));
 				}
 				if (isNotBlank(re.getFamilyName())) {
 					metadata.add(XmlSerializationUtils.asXmlElement("familyname", re.getFamilyName()));
 				}
 				if (re.getAlternativeNames() != null && !re.getAlternativeNames().isEmpty()) {
 					metadata
 						.addAll(
 							re
 								.getAlternativeNames()
 								.stream()
 								.map(name -> XmlSerializationUtils.asXmlElement("alternativename", name))
 								.collect(Collectors.toList()));
 				}
 				break;
 			default:
 				throw new IllegalArgumentException("invalid target type: " + targetType);
 		}
--- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
@ -180,6 +180,7 @@
        <path start="join_relation_datasource"/>
        <path start="join_relation_organization"/>
        <path start="join_relation_project"/>
        <path start="join_relation_person"/>
    </fork>
    <action name="join_relation_publication">
@ -378,6 +379,34 @@
        <error to="Kill"/>
    </action>
    <!-- First join phase for persons: runs CreateRelatedEntitiesJob_phase1 over the relations in
         ${workingDir}/relation and the entities in ${inputGraphRootPath}/person, writing
         ${workingDir}/join_partial/person. On success the flow moves to wait_joins, on failure to Kill. -->
    <action name="join_relation_person">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Join[relation.target = person.id]</name>
            <class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1</class>
            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCoresForJoining}
                --executor-memory=${sparkExecutorMemoryForJoining}
                --driver-memory=${sparkDriverMemoryForJoining}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=5000
                --conf spark.network.timeout=${sparkNetworkTimeout}
            </spark-opts>
            <arg>--inputRelationsPath</arg><arg>${workingDir}/relation</arg>
            <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/person</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
            <arg>--outputPath</arg><arg>${workingDir}/join_partial/person</arg>
        </spark>
        <ok to="wait_joins"/>
        <error to="Kill"/>
    </action>
    <join name="wait_joins" to="fork_join_all_entities"/>
    <fork name="fork_join_all_entities">
@ -388,6 +417,7 @@
        <path start="join_datasource_relations"/>
        <path start="join_organization_relations"/>
        <path start="join_project_relations"/>
        <path start="join_person_relations"/>
    </fork>
    <action name="join_publication_relations">
@ -593,6 +623,35 @@
        <error to="Kill"/>
    </action>
    <!-- Second join phase for persons: runs CreateRelatedEntitiesJob_phase2 over the entities in
         ${inputGraphRootPath}/person and the related entities in ${workingDir}/join_partial, writing
         ${workingDir}/join_entities/person. On success the flow moves to wait_join_phase2, on failure to Kill. -->
    <action name="join_person_relations">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Join[person.id = relatedEntity.source]</name>
            <class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2</class>
            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCoresForJoining}
                --executor-memory=${sparkExecutorMemoryForJoining}
                --driver-memory=${sparkDriverMemoryForJoining}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=5000
                --conf spark.network.timeout=${sparkNetworkTimeout}
            </spark-opts>
            <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/person</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
            <arg>--inputRelatedEntitiesPath</arg><arg>${workingDir}/join_partial</arg>
            <arg>--outputPath</arg><arg>${workingDir}/join_entities/person</arg>
            <arg>--numPartitions</arg><arg>10000</arg>
        </spark>
        <ok to="wait_join_phase2"/>
        <error to="Kill"/>
    </action>
    <join name="wait_join_phase2" to="create_payloads"/>
    <action name="create_payloads">
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/get_score_limits.sh
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/get_score_limits.sh
@ -1,63 +0,0 @@
 #!/usr/bin/bash
 # Read log files from ranking scripts and create a two-line file (header line +
 # value line, tab separated) with score limits for the various measures.
 # To be used by Kleanthis.
 #
 # The shebang needs the '!': without it the first line is a plain comment, the
 # interpreter is whatever shell launches the script, and 'echo -e' is not
 # guaranteed to be understood there.
 #
 # Each *_file variable is assumed to resolve to a single log file in the current
 # directory - TODO confirm; with several matches grep would prefix file names.
 attrank_file=$(ls *attrank*.log);
 pr_file=$(ls *pagerank*.log)
 ram_file=$(ls *ram*.log);
 cc_file=$(ls *cc*.log);
 impulse_file=$(ls *impulse*.log);
 echo
 echo "-----------------------------"
 echo "Attrank file:${attrank_file}";
 echo "PageRank file:${pr_file}";
 echo "RAM file:${ram_file}";
 echo "CC file:${cc_file}";
 echo "Impulse file:${impulse_file}";
 echo "-----------------------------"
 echo
 echo
 # output file will be called score_limits.csv; start it with the header line
 echo -e "influence_top001\tinfluence_top01\tinfluence_top1\tinfluence_top10\tpopularity_top001\tpopularity_top01\tpopularity_top1\tpopularity_top10\timpulse_top001\timpulse_top01\timpulse_top1\timpulse_top10\tcc_top001\tcc_top01\tcc_top1\tcc_top10" > score_limits.csv
 # ---------------------------------------------------- #
 # Get respective score limits (we don't need RAM).
 # For every measure: take the log line starting with the percentage label and
 # keep its second tab-separated field.
 # Influence limits are read from the PageRank log
 inf_001=$(grep "^0.01%" ${pr_file} | cut -f 2);
 inf_01=$(grep "^0.1%" ${pr_file} | cut -f 2);
 inf_1=$(grep "^1%" ${pr_file} | cut -f 2);
 inf_10=$(grep "^10%" ${pr_file} | cut -f 2);
 echo "Influence limits:"
 echo -e "${inf_001}\t${inf_01}\t${inf_1}\t${inf_10}";
 # ---------------------------------------------------- #
 # Popularity limits are read from the AttRank log
 pop_001=$(grep "^0.01%" ${attrank_file} | cut -f 2);
 pop_01=$(grep "^0.1%" ${attrank_file} | cut -f 2);
 pop_1=$(grep "^1%" ${attrank_file} | cut -f 2);
 pop_10=$(grep "^10%" ${attrank_file} | cut -f 2);
 echo "Popularity limits:";
 echo -e "${pop_001}\t${pop_01}\t${pop_1}\t${pop_10}";
 # ---------------------------------------------------- #
 # Impulse limits are read from the impulse log
 imp_001=$(grep "^0.01%" ${impulse_file} | cut -f 2);
 imp_01=$(grep "^0.1%" ${impulse_file} | cut -f 2);
 imp_1=$(grep "^1%" ${impulse_file} | cut -f 2);
 imp_10=$(grep "^10%" ${impulse_file} | cut -f 2);
 echo "Impulse limits:";
 echo -e "${imp_001}\t${imp_01}\t${imp_1}\t${imp_10}";
 # ---------------------------------------------------- #
 # Citation count (CC) limits are read from the cc log
 cc_001=$(grep "^0.01%" ${cc_file} | cut -f 2);
 cc_01=$(grep "^0.1%" ${cc_file} | cut -f 2);
 cc_1=$(grep "^1%" ${cc_file} | cut -f 2);
 cc_10=$(grep "^10%" ${cc_file} | cut -f 2);
 echo "CC limits:";
 echo -e "${cc_001}\t${cc_01}\t${cc_1}\t${cc_10}";
 # ---------------------------------------------------- #
 # Append the value line, in the same column order as the header
 echo -e "${inf_001}\t${inf_01}\t${inf_1}\t${inf_10}\t${pop_001}\t${pop_01}\t${pop_1}\t${pop_10}\t${imp_001}\t${imp_01}\t${imp_1}\t${imp_10}\t${cc_001}\t${cc_01}\t${cc_1}\t${cc_10}" >> score_limits.csv
 echo
 echo "score_limits.csv contents:"
 cat score_limits.csv
 echo;
 echo;
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_openaire_ids_to_dois.py
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_openaire_ids_to_dois.py
@ -1,60 +0,0 @@
 import json
 import sys
 from pyspark.sql import SparkSession
 from pyspark import SparkConf, SparkContext
 # Exactly two positional arguments are required: the source folder and the output folder
 if len(sys.argv) != 3:
    print("Usage: map_openaire_ids_to_dois.py <hdfs_src_dir> <hdfs_output_dir>")
    sys.exit(-1)
 # Create the Spark context and session under the same application name
 conf = SparkConf().setAppName('BIP!: Map OpenAIRE IDs to DOIs')
 sc = SparkContext(conf = conf)
 spark = SparkSession.builder.appName('BIP!: Map OpenAIRE IDs to DOIs').getOrCreate()
 # Silence Spark logging for this job
 sc.setLogLevel('OFF')
 # src_dir is later concatenated directly with the result type name (see the example below)
 src_dir = sys.argv[1]
 output = sys.argv[2]
 # Example values, kept for reference:
 # src_dir = "/tmp/beta_provision/graph/21_graph_cleaned/"
 # output = '/tmp/openaireid_to_dois/'
 def transform(doc):
    """Turn one parsed result record into a tab-separated mapping line.

    The line holds four fields: the record id, the number of DOIs, the DOIs
    joined with the non-printable character 0x02, and the publication year
    (0 when no acceptance date is available).

    :param doc: dict parsed from one JSON line of the graph dump
    :return: the tab-separated line, or None when the record has no DOI
    """
    # get publication year from 'doc.dateofacceptance.value';
    # `or {}` also covers a key that is present but null, which `.get(key, {})` does not
    dateofacceptance = (doc.get('dateofacceptance') or {}).get('value')
    year = 0
    if (dateofacceptance is not None):
        year = dateofacceptance.split('-')[0]
    # for each pid get 'pid.value' if 'pid.qualifier.classid' equals to 'doi'
    dois = [
        pid['value']
        for pid in (doc.get('pid') or [])
        if (pid.get('qualifier') or {}).get('classid') == 'doi' and pid.get('value') is not None
    ]
    num_dois = len(dois)
    # exclude openaire ids that do not correspond to DOIs
    if (num_dois == 0):
        return None
    fields = [ doc['id'], str(num_dois), chr(0x02).join(dois), str(year) ]
    # Join the text values directly: encoding each field to bytes first makes
    # str.join raise TypeError on Python 3.
    return '\t'.join(fields)
 # Build one RDD holding the parsed JSON records of all four result types
 docs = None
 for result_type in ["publication", "dataset", "software", "otherresearchproduct"]:
    # src_dir and result_type are concatenated as-is, so src_dir must end with '/'
    tmp = sc.textFile(src_dir + result_type).map(json.loads)
    if (docs is None):
        docs = tmp
    else:
        # append all result types in one RDD
        docs = docs.union(tmp)
 # Keep only records whose dataInfo.deletedbyinference and dataInfo.invisible are both False;
 # a record missing either flag is dropped as well, since None == False does not hold
 docs = docs.filter(lambda d: d.get('dataInfo', {}).get('deletedbyinference') == False and d.get('dataInfo', {}).get('invisible') == False)
 # Convert each record to its mapping line and discard those without DOIs (transform returned None)
 docs = docs.map(transform).filter(lambda d: d is not None)
 docs.saveAsTextFile(output)
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py
@ -1,168 +0,0 @@
 #!/usr/bin/python
 # This program reads the openaire to doi mapping from the ${synonymFolder} of the workflow
 # and uses this mapping to create doi-based score files in the format required by BiP! DB.
 # This is done by reading each openaire-id based ranking file and joining the openaire based
 # score and classes to all the corresponding dois.
 #################################################################################################
 # Imports
 import sys
 # Sparksession lib to communicate with cluster via session object
 from pyspark.sql import SparkSession
 # Import sql types to define schemas
 from pyspark.sql.types import *
 # Import sql functions with shorthand alias
 import pyspark.sql.functions as F
 from pyspark.sql.functions import max
 # from pyspark.sql.functions import udf
 #################################################################################################
 #################################################################################################
 # Clean up directory name - no longer needed in final workflow version
 '''
 def clean_directory_name(dir_name):
    # We have a name with the form *_bip_universe<digits>_* or *_graph_universe<digits>_* 
    # and we need to keep the parts in *	
    dir_name_parts = dir_name.split('_')
    dir_name_parts = [part for part in dir_name_parts if ('bip' not in part and 'graph' not in part and 'universe' not in part and 'from' not in part)]
    dir_name = dir_name.replace("openaire_id_graph", "openaire_ids")
    clean_name = dir_name + ".txt.gz"
    # clean_name = '_'.join(dir_name_parts)
    # if '_ids' not in clean_name:
    #     clean_name = clean_name.replace('id_', 'ids_')
    # clean_name = clean_name.replace('.txt', '')
    # clean_name = clean_name.replace('.gz', '')
    # if 'openaire_ids_' in clean_name:
    #     clean_name = clean_name.replace('openaire_ids_', '')
        # clean_name = clean_name + '.txt.gz'
    # else:
        # clean_name = clean_name + '.txt.gz'
    return clean_name
 '''
 #################################################################################################
 # Command line: <synonym_folder> <num_partitions> followed by any number of score files
 if len(sys.argv) < 3:
    print ("Usage: ./map_scores_to_dois.py <synonym_folder> <num_partitions> <score_file_1> <score_file_2> <...etc...>")
    sys.exit(-1)
 # Read arguments
 synonyms_folder = sys.argv[1]
 num_partitions = int(sys.argv[2])
 # Derive the openaire-id based ranking file of each argument: strip the "_openaire_id_graph"
 # marker and append the "_openaire_ids.txt.gz" suffix.
 # NOTE(review): the second replace can only match if the first one left such a substring
 # behind, so it looks redundant - confirm before removing.
 input_file_list = [argument.replace("_openaire_id_graph", "").replace("_openaire_id_graph_", "") + "_openaire_ids.txt.gz" for argument in sys.argv[3:]]
 # input_file_list = [clean_directory_name(item) for item in input_file_list]
 # Prepare output specific variables: same names without the "_openaire_ids" part,
 # always ending in ".txt.gz"
 output_file_list = [item.replace("_openaire_ids", "") for item in input_file_list]
 output_file_list = [item + ".txt.gz" if not item.endswith(".txt.gz") else item for item in output_file_list]
 # --- INFO MESSAGES --- #
 print ("\n\n----------------------------")
 print ("Mapping openaire ids to DOIs")
 print ("Reading input from: " + synonyms_folder)
 print ("Num partitions: " + str(num_partitions))
 print ("Input files:" + " -- ".join(input_file_list))
 print ("Output files: " + " -- ".join(output_file_list))
 print ("----------------------------\n\n")
 #######################################################################################
 # We will define the following schemas:
 # --> the schema of the openaire - doi mapping file [string - int - doi_list] (the separator of the doi-list is a non printable character)
 # --> a schema for floating point ranking scores [string - float - string]  (the latter string is the class)
 # --> a schema for integer ranking scores [string - int - string]  (the latter string is the class)
 float_schema = StructType([
 	StructField('id', StringType(), False),
 	StructField('score', FloatType(), False),
 	StructField('class', StringType(), False)
 	])
 int_schema = StructType([
 	StructField('id', StringType(), False),
 	StructField('score', IntegerType(), False),
 	StructField('class', StringType(), False)
 	])
 # This schema concerns the openaire - doi mapping file read from the synonym folder:
 # the openaire id, an integer count column and the doi list kept as a single string
 synonyms_schema = StructType([
 	StructField('id', StringType(), False),
 	StructField('num_synonyms', IntegerType(), False),
    StructField('doi_list', StringType(), False),
 	])
 #######################################################################################
 # Start spark session
 spark = SparkSession.builder.appName('Map openaire scores to DOIs').getOrCreate()
 # Set Log Level for spark session
 spark.sparkContext.setLogLevel('WARN')
 #######################################################################################
 # MAIN Program
 # Read and repartition the synonym folder - also cache it since we will need to perform multiple joins
 synonym_df = spark.read.schema(synonyms_schema).option('delimiter', '\t').csv(synonyms_folder)
 # Split the doi list on its non printable separator, then emit one (id, doi) row per doi
 synonym_df = synonym_df.select('id',  F.split(F.col('doi_list'), chr(0x02)).alias('doi_list'))
 synonym_df = synonym_df.select('id', F.explode('doi_list').alias('doi')).repartition(num_partitions, 'id').cache()
 # TESTING
 # print ("Synonyms: " + str(synonym_df.count()))
 # print ("DF looks like this:" )
 # synonym_df.show(1000, False)
 print ("\n\n-----------------------------")
 # Now we need to join the score files on the openaire-id with the synonyms and then keep
 # only doi - score - class and write this to the output
 for offset, input_file in enumerate(input_file_list):
    print ("Mapping scores from " + input_file)
    # Select correct schema: names containing "attrank", "pr" or "ram" hold float scores.
    # NOTE(review): these are substring tests on the whole path, so a folder name
    # containing e.g. "pr" also selects the float schema - confirm the paths in use.
    schema = int_schema
    if "attrank" in input_file.lower() or "pr" in input_file.lower() or "ram" in input_file.lower():
        schema = float_schema
    # Load file to dataframe. It is scanned three times below (max score + two joins),
    # so this is the dataframe worth caching.
    ranking_df = spark.read.schema(schema).option('delimiter', '\t').csv(input_file).repartition(num_partitions, 'id').cache()
    # Get max score (F.max is spelled out so the builtin max is not needed here)
    max_score = ranking_df.select(F.max('score').alias('max')).collect()[0]['max']
    print ("Max Score for " + str(input_file) + " is " + str(max_score))
    # TESTING
    # print ("Loaded df sample:")
    # ranking_df.show(1000, False)
    # Join scores to synonyms and keep required fields. The result is written exactly
    # once, so it is not cached: caching it only held memory that was never released.
    doi_score_df = synonym_df.join(ranking_df, ['id']).select('doi', 'score', 'class').repartition(num_partitions, 'doi')
    # Write output
    output_file = output_file_list[offset]
    print ("Writing to: " + output_file)
    doi_score_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_file, compression='gzip')
    # Create another file for the bip update process: adds the score normalized by the
    # max score and a duplicate of the class column
    normalized_df = ranking_df.select('id', 'score', (F.col('score')/max_score).alias('normalized_score'), 'class', F.col('class').alias('class_dup'))
    bip_update_df = synonym_df.join(normalized_df, ['id']).select('doi', 'score', 'normalized_score', 'class', 'class_dup').repartition(num_partitions, 'doi')
    output_file = output_file.replace(".txt.gz", "_for_bip_update.txt.gz")
    print ("Writing bip update to: " + output_file)
    bip_update_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_file, compression='gzip')
    # Free memory: release the dataframe that was actually cached for this file
    ranking_df.unpersist(True)
 print ("-----------------------------")
 print ("\n\nFinished!\n\n")
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml
@ -17,10 +17,6 @@
 				<name>openaireGraphInputPath</name>
 				<value>${nameNode}/${workingDir}/openaire_id_graph</value>
 			</property>
 			<property>
 				<name>synonymFolder</name>
 				<value>${nameNode}/${workingDir}/openaireid_to_dois/</value>
 			</property>
 			<property>
 				<name>checkpointDir</name>
 				<value>${nameNode}/${workingDir}/check/</value>
@ -32,29 +28,34 @@
 		</configuration>
 	</global>
-	<!-- start using a decision node, so as to determine from which point onwards a job will continue -->
+	<!-- Start using a decision node, to determine from which point onwards a job will continue -->
 	<start to="entry-point-decision" />
 	<decision name="entry-point-decision">
 		<switch>
-			<!-- The default will be set as the normal start, a.k.a. get-doi-synonyms -->
+
-			<!-- If any different condition is set, go to the corresponding start -->
+			<!-- Start from creating the citation network (i.e., normal execution should start from here) -->
 			<case to="create-openaire-ranking-graph">${wf:conf('resume') eq "start"}</case>
 			<!-- Different citation-based impact indicators are computed -->
 			<case to="spark-cc">${wf:conf('resume') eq "cc"}</case>
 			<case to="spark-ram">${wf:conf('resume') eq "ram"}</case>
 			<case to="spark-impulse">${wf:conf('resume') eq "impulse"}</case>
 			<case to="spark-pagerank">${wf:conf('resume') eq "pagerank"}</case>
 			<case to="spark-attrank">${wf:conf('resume') eq "attrank"}</case>
 			<!-- <case to="iterative-rankings">${wf:conf('resume') eq "rankings-iterative"}</case> -->
 			<case to="get-file-names">${wf:conf('resume') eq "format-results"}</case>
 			<case to="map-openaire-to-doi">${wf:conf('resume') eq "map-ids"}</case>
 			<case to="map-scores-to-dois">${wf:conf('resume') eq "map-scores"}</case>
 			<case to="create-openaire-ranking-graph">${wf:conf('resume') eq "start"}</case>
-			<!-- Aggregation of impact scores on the project level		-->
+			<!-- Format the results appropriately before transforming them to action sets -->
 			<case to="get-file-names">${wf:conf('resume') eq "format-results"}</case>
 			<!-- Aggregation of impact scores on the project level -->
 			<case to="project-impact-indicators">${wf:conf('resume') eq "projects-impact"}</case>
 			<!-- Create action sets -->
 			<case to="create-actionset">${wf:conf('resume') eq "create-actionset"}</case>
 			<!-- The default will be set as the normal start, a.k.a. create-openaire-ranking-graph -->
 			<default to="create-openaire-ranking-graph" />
 		</switch>
 	</decision>
@ -295,18 +296,11 @@
 			<capture-output/>
 		</shell>
-		<ok to="format-result-files" />
+		<ok to="format-json-files" />
 		<error to="filename-getting-error" />
 	</action>
 	<!-- Now we will run in parallel the formatting of ranking files for BiP! DB and openaire (json files) -->
 	<fork name="format-result-files">
 		<path start="format-bip-files"/>
 		<path start="format-json-files"/>
 	</fork>
 	<!-- Format json files -->
 	<!-- Two parts: a) format files b) make the file endings .json.gz -->
 	<action name="format-json-files">
@ -345,139 +339,8 @@
 			<file>${wfAppPath}/format_ranking_results.py#format_ranking_results.py</file>
 		</spark>
 		<ok to="join-file-formatting" />
 		<error to="json-formatting-fail" />
 	</action>
 	<!-- This is the second line of parallel workflow execution where we create the BiP! DB files -->
 	<action name="format-bip-files">
 		<!-- This is required as a tag for spark jobs, regardless of programming language -->
 		<spark xmlns="uri:oozie:spark-action:0.2">
 			<!-- using configs from an example on openaire -->
 			<master>yarn-cluster</master>
 			<mode>cluster</mode>
 			<!-- This is the name of our job -->
 			<name>Format Ranking Results BiP! DB</name>
 			<!-- Script name goes here -->
 			<jar>format_ranking_results.py</jar>
 			<!-- spark configuration options: I've taken most of them from an example from dhp workflows / Master value stolen from sandro -->
 			<spark-opts>
 				--executor-memory=${sparkNormalExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
 				--driver-memory=${sparkNormalDriverMemory}
 				--conf spark.executor.memoryOverhead=${sparkNormalExecutorMemory}
 				--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
 			</spark-opts>
 			<!-- Script arguments here -->
 			<arg>zenodo</arg>
 			<!-- Input files must be identified dynamically -->
 			<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
 			<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
 			<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
 			<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
 			<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
 			<!-- Num partitions -->
 			<arg>${sparkShufflePartitions}</arg>
 			<!-- Type of data to be produced [bip (dois) / openaire (openaire-ids) ] -->
 			<arg>openaire</arg>
 			<!-- This needs to point to the file on the hdfs i think -->
 			<file>${wfAppPath}/format_ranking_results.py#format_ranking_results.py</file>
 		</spark>
 		<ok to="join-file-formatting" />
 		<error to="bip-formatting-fail" />
 	</action>
 	<!-- Finish formatting jobs -->
 	<join name="join-file-formatting" to="map-openaire-to-doi"/>
 	<!-- maps openaire ids to DOIs -->
 	<action name="map-openaire-to-doi">
 		<spark xmlns="uri:oozie:spark-action:0.2">
 			<!-- Delete previously created doi synonym folder -->
 			<prepare>
 				<delete path="${synonymFolder}"/>
 			</prepare>
 			<master>yarn-cluster</master>
 			<mode>cluster</mode>
 			<name>Openaire-DOI synonym collection</name>
 			<jar>map_openaire_ids_to_dois.py</jar>
 			<spark-opts>
 				--executor-memory=${sparkHighExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
 				--driver-memory=${sparkHighDriverMemory}
 				--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
 				--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
 			</spark-opts>
 			<!-- Script arguments here -->
 			<arg>${openaireDataInput}/</arg>
 			<!-- Output folder for the openaire id to DOI synonym mapping -->
 			<arg>${synonymFolder}</arg>
 			<file>${wfAppPath}/map_openaire_ids_to_dois.py#map_openaire_ids_to_dois.py</file>
 		</spark>
 		<ok to="map-scores-to-dois" />
 		<error to="synonym-collection-fail" />
 	</action>
 	<!-- mapping openaire scores to DOIs -->
 	<action name="map-scores-to-dois">
 		<!-- This is required as a tag for spark jobs, regardless of programming language -->
 		<spark xmlns="uri:oozie:spark-action:0.2">
 			<!-- using configs from an example on openaire -->
 			<master>yarn-cluster</master>
 			<mode>cluster</mode>
 			<name>Mapping Openaire Scores to DOIs</name>
 			<jar>map_scores_to_dois.py</jar>
 			<spark-opts>
 				--executor-memory=${sparkHighExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
 				--driver-memory=${sparkHighDriverMemory}
 				--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
 				--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
 			</spark-opts>
 			<!-- Script arguments here -->
 			<arg>${synonymFolder}</arg>
 			<!-- Number of partitions -->
 			<arg>${sparkShufflePartitions}</arg>
 			<!-- The remaining inputs are the ranking files produced for BiP! DB -->
 			<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
 			<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
 			<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
 			<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
 			<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
 			<file>${wfAppPath}/map_scores_to_dois.py#map_scores_to_dois.py</file>
 		</spark>
 		<ok to="project-impact-indicators" />
-		<error to="map-scores-fail" />
+		<error to="json-formatting-fail" />
 	</action>
 	<action name="project-impact-indicators">
@ -594,18 +457,6 @@
 		<message>Error formatting json files, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
 	</kill>
 	<kill name="bip-formatting-fail">
 		<message>Error formatting BIP files, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
 	</kill>
 	<kill name="synonym-collection-fail">
 		<message>Synonym collection failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
 	</kill>
 	<kill name="map-scores-fail">
 		<message>Mapping scores to DOIs failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
 	</kill>
 	<kill name="actionset-delete-fail">
 		<message>Deleting output path for actionsets failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
 	</kill>
--- a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/models/LastVisitData.java
+++ b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/models/LastVisitData.java
@ -3,8 +3,8 @@ package eu.dnetlib.dhp.swh.models;
 import java.io.Serializable;
 import com.cloudera.com.fasterxml.jackson.annotation.JsonProperty;
 import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
 import com.fasterxml.jackson.annotation.JsonProperty;
@JsonIgnoreProperties(ignoreUnknown = true)
 public class LastVisitData implements Serializable {
--- a/pom.xml
+++ b/pom.xml
@ -937,7 +937,7 @@
        <commons.logging.version>1.1.3</commons.logging.version>
        <commons-validator.version>1.7</commons-validator.version>
        <dateparser.version>1.0.7</dateparser.version>
-        <dhp-schemas.version>[8.0.1]</dhp-schemas.version>
+        <dhp-schemas.version>[9.0.0]</dhp-schemas.version>
        <dhp.cdh.version>cdh5.9.2</dhp.cdh.version>
        <dhp.commons.lang.version>3.5</dhp.commons.lang.version>
        <dhp.guava.version>11.0.2</dhp.guava.version>